1 // Written in the D programming language.
2 
3 /++
4     $(P The `std.uni` module provides an implementation
5     of fundamental Unicode algorithms and data structures.
6     This doesn't include UTF encoding and decoding primitives,
7     see $(REF decode, std,_utf) and $(REF encode, std,_utf) in $(MREF std, utf)
8     for this functionality. )
9 
10 $(SCRIPT inhibitQuickIndex = 1;)
11 $(DIVC quickindex,
12 $(BOOKTABLE,
13 $(TR $(TH Category) $(TH Functions))
14 $(TR $(TD Decode) $(TD
15     $(LREF byCodePoint)
16     $(LREF byGrapheme)
17     $(LREF decodeGrapheme)
18     $(LREF graphemeStride)
19 ))
20 $(TR $(TD Comparison) $(TD
21     $(LREF icmp)
22     $(LREF sicmp)
23 ))
24 $(TR $(TD Classification) $(TD
25     $(LREF isAlpha)
26     $(LREF isAlphaNum)
27     $(LREF isCodepointSet)
28     $(LREF isControl)
29     $(LREF isFormat)
30     $(LREF isGraphical)
31     $(LREF isIntegralPair)
32     $(LREF isMark)
33     $(LREF isNonCharacter)
34     $(LREF isNumber)
35     $(LREF isPrivateUse)
36     $(LREF isPunctuation)
37     $(LREF isSpace)
38     $(LREF isSurrogate)
39     $(LREF isSurrogateHi)
40     $(LREF isSurrogateLo)
41     $(LREF isSymbol)
42     $(LREF isWhite)
43 ))
44 $(TR $(TD Normalization) $(TD
45     $(LREF NFC)
46     $(LREF NFD)
47     $(LREF NFKD)
48     $(LREF NormalizationForm)
49     $(LREF normalize)
50 ))
51 $(TR $(TD Decompose) $(TD
52     $(LREF decompose)
53     $(LREF decomposeHangul)
54     $(LREF UnicodeDecomposition)
55 ))
56 $(TR $(TD Compose) $(TD
57     $(LREF compose)
58     $(LREF composeJamo)
59 ))
60 $(TR $(TD Sets) $(TD
61     $(LREF CodepointInterval)
62     $(LREF CodepointSet)
63     $(LREF InversionList)
64     $(LREF unicode)
65 ))
66 $(TR $(TD Trie) $(TD
67     $(LREF codepointSetTrie)
68     $(LREF CodepointSetTrie)
69     $(LREF codepointTrie)
70     $(LREF CodepointTrie)
71     $(LREF toTrie)
72     $(LREF toDelegate)
73 ))
74 $(TR $(TD Casing) $(TD
75     $(LREF asCapitalized)
76     $(LREF asLowerCase)
77     $(LREF asUpperCase)
78     $(LREF isLower)
79     $(LREF isUpper)
80     $(LREF toLower)
81     $(LREF toLowerInPlace)
82     $(LREF toUpper)
83     $(LREF toUpperInPlace)
84 ))
85 $(TR $(TD Utf8Matcher) $(TD
86     $(LREF isUtfMatcher)
87     $(LREF MatcherConcept)
88     $(LREF utfMatcher)
89 ))
90 $(TR $(TD Separators) $(TD
91     $(LREF lineSep)
92     $(LREF nelSep)
93     $(LREF paraSep)
94 ))
95 $(TR $(TD Building blocks) $(TD
96     $(LREF allowedIn)
97     $(LREF combiningClass)
98     $(LREF Grapheme)
99 ))
100 ))
101 
102     $(P All primitives listed operate on Unicode characters and
103         sets of characters. For functions which operate on ASCII characters
104         and ignore Unicode $(CHARACTERS), see $(MREF std, ascii).
105         For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms
106         used throughout this module see the $(S_LINK Terminology, terminology) section
107         below.
108     )
109     $(P The focus of this module is the core needs of developing Unicode-aware
110         applications. To that effect it provides the following optimized primitives:
111     )
112     $(UL
113         $(LI Character classification by category and common properties:
114             $(LREF isAlpha), $(LREF isWhite) and others.
115         )
116         $(LI
117             Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)).
118         )
119         $(LI
120             Converting text to any of the four normalization forms via $(LREF normalize).
121         )
122         $(LI
123             Decoding ($(LREF decodeGrapheme))  and iteration ($(LREF byGrapheme), $(LREF graphemeStride))
124             by user-perceived characters, that is by $(LREF Grapheme) clusters.
125         )
126         $(LI
127             Decomposing and composing of individual character(s) according to canonical
128             or compatibility rules, see $(LREF compose) and $(LREF decompose),
129             including the specific version for Hangul syllables $(LREF composeJamo)
130             and $(LREF decomposeHangul).
131         )
132     )
133     $(P It's recognized that an application may need further enhancements
134         and extensions, such as less commonly known algorithms,
135         or tailoring existing ones for region specific needs. To help users
136         with building any extra functionality beyond the core primitives,
137         the module provides:
138     )
139     $(UL
140         $(LI
141             $(LREF CodepointSet), a type for easy manipulation of sets of characters.
142             Besides the typical set algebra it provides an unusual feature:
143             a D source code generator for detection of $(CODEPOINTS) in this set.
144             This is a boon for meta-programming parser frameworks,
145             and is used internally to power classification in small
146             sets like $(LREF isWhite).
147         )
148         $(LI
149             A way to construct optimal packed multi-stage tables also known as a
150             special case of $(LINK2 https://en.wikipedia.org/wiki/Trie, Trie).
151             The functions $(LREF codepointTrie), $(LREF codepointSetTrie)
152             construct custom tries that map dchar to value.
153             The end result is a fast and predictable $(BIGOH 1) lookup that powers
154             functions like $(LREF isAlpha) and $(LREF combiningClass),
155             but for user-defined data sets.
156         )
157         $(LI
158             A useful technique for Unicode-aware parsers that perform
159             character classification of encoded $(CODEPOINTS)
160             is to avoid unnecessary decoding at all costs.
161             $(LREF utfMatcher) provides an improvement over the usual workflow
162             of decode-classify-process, combining the decoding and classification
163             steps. By extracting necessary bits directly from encoded
164             $(S_LINK Code unit, code units) matchers achieve
165             significant performance improvements. See $(LREF MatcherConcept) for
166             the common interface of UTF matchers.
167         )
168         $(LI
169             Generally useful building blocks for customized normalization:
170             $(LREF combiningClass) for querying combining class
171             and $(LREF allowedIn) for testing the Quick_Check
172             property of a given normalization form.
173         )
174         $(LI
175             Access to a large selection of commonly used sets of $(CODEPOINTS).
176             $(S_LINK Unicode properties, Supported sets) include Script,
177             Block and General Category. The exact contents of a set can be
178             observed in the CLDR utility, on the
179             $(HTTP www.unicode.org/cldr/utility/properties.jsp, property index) page
180             of the Unicode website.
181             See $(LREF unicode) for easy and (optionally) compile-time checked set
182             queries.
183         )
184     )
185     $(SECTION Synopsis)
186     ---
187     import std.uni;
188     void main()
189     {
190         // initialize code point sets using script/block or property name
191         // now 'set' contains code points from both scripts.
192         auto set = unicode("Cyrillic") | unicode("Armenian");
193         // same thing but simpler and checked at compile-time
194         auto ascii = unicode.ASCII;
195         auto currency = unicode.Currency_Symbol;
196 
197         // easy set ops
198         auto a = set & ascii;
199         assert(a.empty); // as it has no intersection with ascii
200         a = set | ascii;
201         auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian
202 
203         // some properties of code point sets
204         assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
205         // testing presence of a code point in a set
206         // is just fine, it is O(logN)
207         assert(!b['$']);
208         assert(!b['\u058F']); // Armenian dram sign
209         assert(b['¥']);
210 
211         // building fast lookup tables, these guarantee O(1) complexity
212         // 1-level Trie lookup table essentially a huge bit-set ~262Kb
213         auto oneTrie = toTrie!1(b);
214         // 2-level far more compact but typically slightly slower
215         auto twoTrie = toTrie!2(b);
216         // 3-level even smaller, and a bit slower yet
217         auto threeTrie = toTrie!3(b);
218         assert(oneTrie['£']);
219         assert(twoTrie['£']);
220         assert(threeTrie['£']);
221 
222         // build the trie with the most sensible trie level
223         // and bind it as a functor
224         auto cyrillicOrArmenian = toDelegate(set);
225         auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
226         assert(balance == "ընկեր!");
227         // compatible with bool delegate(dchar)
228         bool delegate(dchar) bindIt = cyrillicOrArmenian;
229 
230         // Normalization
231         string s = "Plain ascii (and not only), is always normalized!";
232         assert(s is normalize(s));// is the same string
233 
234         string nonS = "A\u0308ffin"; // 'A' followed by a combining diaeresis
235         auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
236         assert(nS == "Äffin");
237         assert(nS != nonS);
238         string composed = "Äffin";
239 
240         assert(normalize!NFD(composed) == "A\u0308ffin");
241         // to NFKD, compatibility decomposition useful for fuzzy matching/searching
242         assert(normalize!NFKD("2¹⁰") == "210");
243     }
244     ---
245     $(SECTION Terminology)
246     $(P The following is a list of important Unicode notions
247     and definitions. Any conventions used specifically in this
248     module alone are marked as such. The descriptions are based on the formal
249     definition as found in $(HTTP www.unicode.org/versions/Unicode6.2.0/ch03.pdf,
250     chapter three of The Unicode Standard Core Specification.)
251     )
252     $(P $(DEF Abstract character) A unit of information used for the organization,
253         control, or representation of textual data.
254         Note that:
255         $(UL
256             $(LI When representing data, the nature of that data
257                 is generally symbolic as opposed to some other
258                 kind of data (for example, visual).
259             )
260              $(LI An abstract character has no concrete form
261                 and should not be confused with a $(S_LINK Glyph, glyph).
262             )
263             $(LI An abstract character does not necessarily
264                 correspond to what a user thinks of as a “character”
265                 and should not be confused with a $(LREF Grapheme).
266             )
267             $(LI The abstract characters encoded (see Encoded character)
268                 are known as Unicode abstract characters.
269             )
270             $(LI Abstract characters not directly
271                 encoded by the Unicode Standard can often be
272                 represented by the use of combining character sequences.
273             )
274         )
275     )
276     $(P $(DEF Canonical decomposition)
277         The decomposition of a character or character sequence
278         that results from recursively applying the canonical
279         mappings found in the Unicode Character Database
280         and those described in Conjoining Jamo Behavior
281         (section 12 of
282         $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)).
283     )
284     $(P $(DEF Canonical composition)
285         The precise definition of the Canonical composition
286         is the algorithm as specified in $(HTTP www.unicode.org/uni2book/ch03.pdf,
287         Unicode Conformance) section 11.
288         Informally it's the process that does the reverse of the canonical
289         decomposition with the addition of certain rules
290         that e.g. prevent legacy characters from appearing in the composed result.
291     )
292     $(P $(DEF Canonical equivalent)
293         Two character sequences are said to be canonical equivalents if
294         their full canonical decompositions are identical.
295     )
296     $(P $(DEF Character) Typically differs by context.
297         For the purpose of this documentation the term $(I character)
298         implies $(I encoded character), that is, a code point having
299         an assigned abstract character (a symbolic meaning).
300     )
301     $(P $(DEF Code point) Any value in the Unicode codespace;
302         that is, the range of integers from 0 to 10FFFF (hex).
303         Not all code points are assigned to encoded characters.
304     )
305     $(P $(DEF Code unit) The minimal bit combination that can represent
306         a unit of encoded text for processing or interchange.
307         Depending on the encoding this could be:
308         8-bit code units in the UTF-8 (`char`),
309         16-bit code units in the UTF-16 (`wchar`),
310         and 32-bit code units in the UTF-32 (`dchar`).
311         $(I Note that in UTF-32, a code unit is a code point
312         and is represented by the D `dchar` type.)
313     )
314     $(P $(DEF Combining character) A character with the General Category
315         of Combining Mark(M).
316         $(UL
317             $(LI All characters with non-zero canonical combining class
318             are combining characters, but the reverse is not the case:
319             there are combining characters with a zero combining class.
320             )
321             $(LI These characters are not normally used in isolation
322             unless they are being described. They include such characters
323             as accents, diacritics, Hebrew points, Arabic vowel signs,
324             and Indic matras.
325             )
326         )
327     )
328     $(P $(DEF Combining class)
329         A numerical value used by the Unicode Canonical Ordering Algorithm
330         to determine which sequences of combining marks are to be
331         considered canonically equivalent and  which are not.
332     )
333     $(P $(DEF Compatibility decomposition)
334         The decomposition of a character or character sequence that results
335         from recursively applying both the compatibility mappings and
336         the canonical mappings found in the Unicode Character Database, and those
337         described in Conjoining Jamo Behavior, until no characters
338         can be further decomposed.
339     )
340     $(P $(DEF Compatibility equivalent)
341         Two character sequences are said to be compatibility
342         equivalents if their full compatibility decompositions are identical.
343     )
344     $(P $(DEF Encoded character) An association (or mapping)
345         between an abstract character and a code point.
346     )
347     $(P $(DEF Glyph) The actual, concrete image of a glyph representation
348         having been rasterized or otherwise imaged onto some display surface.
349     )
350     $(P $(DEF Grapheme base) A character with the property
351         Grapheme_Base, or any standard Korean syllable block.
352     )
353     $(P $(DEF Grapheme cluster) Defined as the text between
354         grapheme boundaries  as specified by Unicode Standard Annex #29,
355         $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation).
356         Important general properties of a grapheme:
357         $(UL
358             $(LI The grapheme cluster represents a horizontally segmentable
359             unit of text, consisting of some grapheme base (which may
360             consist of a Korean syllable) together with any number of
361             nonspacing marks applied to it.
362             )
363             $(LI  A grapheme cluster typically starts with a grapheme base
364             and then extends across any subsequent sequence of nonspacing marks.
365             A grapheme cluster is most directly relevant to text rendering and
366             processes such as cursor placement and text selection in editing,
367             but may also be relevant to comparison and searching.
368             )
369             $(LI For many processes, a grapheme cluster behaves as if it was a
370             single character with the same properties as its grapheme base.
371             Effectively, nonspacing marks apply $(I graphically) to the base,
372             but do not change its properties.
373             )
374         )
375         $(P This module defines a number of primitives that work with graphemes:
376         $(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride).
377         All of them are using $(I extended grapheme) boundaries
378         as defined in the aforementioned standard annex.
379         )
380     )
381     $(P $(DEF Nonspacing mark) A combining character with the
382         General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me).
383     )
384     $(P $(DEF Spacing mark) A combining character that is not a nonspacing mark.
385     )
386     $(SECTION Normalization)
387     $(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent)
388         or $(S_LINK Compatibility equivalent, compatibility equivalent)
389         characters in the Unicode Standard make it necessary to have a full, formal
390         definition of equivalence for Unicode strings.
391         String equivalence is determined by a process called normalization,
392         whereby strings are converted into forms which are compared
393         directly for identity. This is the primary goal of the normalization process,
394         see the function $(LREF normalize) to convert into any of
395         the four defined forms.
396     )
397     $(P A very important attribute of the Unicode Normalization Forms
398         is that they must remain stable between versions of the Unicode Standard.
399         A Unicode string normalized to a particular Unicode Normalization Form
400         in one version of the standard is guaranteed to remain in that Normalization
401         Form for implementations of future versions of the standard.
402     )
403     $(P The Unicode Standard specifies four normalization forms.
404         Informally, two of these forms are defined by maximal decomposition
405         of equivalent sequences, and two of these forms are defined
406         by maximal $(I composition) of equivalent sequences.
407             $(UL
408             $(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition,
409                 canonical decomposition) of a character sequence.)
410             $(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition,
411                 compatibility decomposition) of a character sequence.)
412             $(LI Normalization Form C (NFC): The canonical composition of the
413                 $(S_LINK Canonical decomposition, canonical decomposition)
414                 of a coded character sequence.)
415             $(LI Normalization Form KC (NFKC): The canonical composition
416             of the $(S_LINK Compatibility decomposition,
417                 compatibility decomposition) of a character sequence.)
418             )
419     )
420     $(P The choice of the normalization form depends on the particular use case.
421         NFC is the best form for general text, since it's more compatible with
422         strings converted from legacy encodings. NFKC is the preferred form for
423         identifiers, especially where there are security concerns. NFD and NFKD
424         are the most useful for internal processing.
425     )
426     $(SECTION Construction of lookup tables)
427     $(P The Unicode standard describes a set of algorithms that
428         depend on having the ability to quickly look up various properties
429         of a code point. Given the codespace of about 1 million $(CODEPOINTS),
430         it is not a trivial task to provide a space-efficient solution for
431         the multitude of properties.
432     )
433     $(P Common approaches such as hash-tables or binary search over
434         sorted code point intervals (as in $(LREF InversionList)) are insufficient.
435         Hash-tables have enormous memory footprint and binary search
436         over intervals is not fast enough for some heavy-duty algorithms.
437     )
438     $(P The recommended solution (see Unicode Implementation Guidelines)
439         is using multi-stage tables that are an implementation of the
440         $(HTTP en.wikipedia.org/wiki/Trie, Trie) data structure with integer
441         keys and a fixed number of stages. For the remainder of the section
442         this will be called a fixed trie. The following describes a particular
443         implementation that is aimed at speed of access at the expense
444         of ideal size savings.
445     )
446     $(P Taking a 2-level Trie as an example the principle of operation is as follows.
447         Split the number of bits in a key (code point, 21 bits) into 2 components
448         (e.g. 13 and 8).  The first is the number of bits in the index of the trie
449          and the other is the number of bits in each page of the trie.
450         The layout of the trie is then an array of size 2^^bits-of-index followed by
451         an array of memory chunks of size 2^^bits-of-page/bits-per-element.
452     )
453     $(P The number of pages is variable (but not less than 1)
454         unlike the number of entries in the index. The slots of the index
455         all have to contain a number of a page that is present. The lookup is then
456         just a couple of operations - slice the upper bits,
457         lookup an index for these, take a page at this index and use
458         the lower bits as an offset within this page.
459 
460         Assuming that pages are laid out consecutively
461         in one array at `pages`, the pseudo-code is:
462     )
463     ---
464     auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits;
465     pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)];
466     ---
467     $(P Where if `elemsPerPage` is a power of 2 the whole process is
468         a handful of simple instructions and 2 array reads. Subsequent levels
469         of the trie are introduced by recursing on this notion - the index array
470         is treated as values. The number of bits in index is then again
471         split into 2 parts, with pages over 'current-index' and the new 'upper-index'.
472     )
473 
474     $(P For completeness a level 1 trie is simply an array.
475         The current implementation takes advantage of bit-packing values
476         when the range is known to be limited in advance (such as `bool`).
477         See also $(LREF BitPacked) for enforcing it manually.
478         The major size advantage however comes from the fact
479         that multiple $(B identical pages on every level are merged) by construction.
480     )
481     $(P The process of constructing a trie is more involved and is hidden from
482         the user in the form of the convenience functions $(LREF codepointTrie),
483         $(LREF codepointSetTrie) and the even more convenient $(LREF toTrie).
484         In general a set or built-in AA with `dchar` type
485         can be turned into a trie. The trie object in this module
486         is read-only (immutable); it's effectively frozen after construction.
487     )
488     $(SECTION Unicode properties)
489     $(P This is a full list of Unicode properties accessible through $(LREF unicode)
490         with specific helpers per category nested within. Consult the
491         $(HTTP www.unicode.org/cldr/utility/properties.jsp, CLDR utility)
492         when in doubt about the contents of a particular set.
493     )
494     $(P General category sets listed below are only accessible with the
495         $(LREF unicode) shorthand accessor.)
496         $(BOOKTABLE $(B General category ),
497              $(TR $(TH Abb.) $(TH Long form)
498                 $(TH Abb.) $(TH Long form)$(TH Abb.) $(TH Long form))
499             $(TR $(TD L) $(TD Letter)
500                 $(TD Cn) $(TD Unassigned)  $(TD Po) $(TD Other_Punctuation))
501             $(TR $(TD Ll) $(TD Lowercase_Letter)
502                 $(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation))
503             $(TR $(TD Lm) $(TD Modifier_Letter)
504                 $(TD Cs) $(TD Surrogate)   $(TD S) $(TD Symbol))
505             $(TR $(TD Lo) $(TD Other_Letter)
506                 $(TD N) $(TD Number)  $(TD Sc) $(TD Currency_Symbol))
507             $(TR $(TD Lt) $(TD Titlecase_Letter)
508               $(TD Nd) $(TD Decimal_Number)  $(TD Sk) $(TD Modifier_Symbol))
509             $(TR $(TD Lu) $(TD Uppercase_Letter)
510               $(TD Nl) $(TD Letter_Number)   $(TD Sm) $(TD Math_Symbol))
511             $(TR $(TD M) $(TD Mark)
512               $(TD No) $(TD Other_Number)    $(TD So) $(TD Other_Symbol))
513             $(TR $(TD Mc) $(TD Spacing_Mark)
514               $(TD P) $(TD Punctuation) $(TD Z) $(TD Separator))
515             $(TR $(TD Me) $(TD Enclosing_Mark)
516               $(TD Pc) $(TD Connector_Punctuation)   $(TD Zl) $(TD Line_Separator))
517             $(TR $(TD Mn) $(TD Nonspacing_Mark)
518               $(TD Pd) $(TD Dash_Punctuation)    $(TD Zp) $(TD Paragraph_Separator))
519             $(TR $(TD C) $(TD Other)
520               $(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator))
521             $(TR $(TD Cc) $(TD Control) $(TD Pf)
522               $(TD Final_Punctuation)   $(TD -) $(TD Any))
523             $(TR $(TD Cf) $(TD Format)
524               $(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII))
525     )
526     $(P Sets for other commonly useful properties that are
527         accessible with $(LREF unicode):)
528         $(BOOKTABLE $(B Common binary properties),
529             $(TR $(TH Name) $(TH Name) $(TH Name))
530             $(TR $(TD Alphabetic)  $(TD Ideographic) $(TD Other_Uppercase))
531             $(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax))
532             $(TR $(TD Bidi_Control)    $(TD ID_Start)    $(TD Pattern_White_Space))
533             $(TR $(TD Cased)   $(TD IDS_Trinary_Operator)    $(TD Quotation_Mark))
534             $(TR $(TD Case_Ignorable)  $(TD Join_Control)    $(TD Radical))
535             $(TR $(TD Dash)    $(TD Logical_Order_Exception) $(TD Soft_Dotted))
536             $(TR $(TD Default_Ignorable_Code_Point)    $(TD Lowercase)   $(TD STerm))
537             $(TR $(TD Deprecated)  $(TD Math)    $(TD Terminal_Punctuation))
538             $(TR $(TD Diacritic)   $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph))
539             $(TR $(TD Extender)    $(TD Other_Alphabetic)    $(TD Uppercase))
540             $(TR $(TD Grapheme_Base)   $(TD Other_Default_Ignorable_Code_Point)  $(TD Variation_Selector))
541             $(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend)   $(TD White_Space))
542             $(TR $(TD Grapheme_Link)   $(TD Other_ID_Continue)   $(TD XID_Continue))
543             $(TR $(TD Hex_Digit)   $(TD Other_ID_Start)  $(TD XID_Start))
544             $(TR $(TD Hyphen)  $(TD Other_Lowercase) )
545             $(TR $(TD ID_Continue) $(TD Other_Math)  )
546     )
547     $(P Below is the table with block names accepted by $(LREF unicode.block).
548         Note that the shorthand version $(LREF unicode) requires "In"
549         to be prepended to the names of blocks so as to disambiguate
550         scripts and blocks.
551     )
552     $(BOOKTABLE $(B Blocks),
553         $(TR $(TD Aegean Numbers)    $(TD Ethiopic Extended) $(TD Mongolian))
554         $(TR $(TD Alchemical Symbols)    $(TD Ethiopic Extended-A)   $(TD Musical Symbols))
555         $(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement)   $(TD Myanmar))
556         $(TR $(TD Ancient Greek Musical Notation)    $(TD General Punctuation)   $(TD Myanmar Extended-A))
557         $(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes)  $(TD New Tai Lue))
558         $(TR $(TD Ancient Symbols)   $(TD Georgian)  $(TD NKo))
559         $(TR $(TD Arabic)    $(TD Georgian Supplement)   $(TD Number Forms))
560         $(TR $(TD Arabic Extended-A) $(TD Glagolitic)    $(TD Ogham))
561         $(TR $(TD Arabic Mathematical Alphabetic Symbols)    $(TD Gothic)    $(TD Ol Chiki))
562         $(TR $(TD Arabic Presentation Forms-A)   $(TD Greek and Coptic)  $(TD Old Italic))
563         $(TR $(TD Arabic Presentation Forms-B)   $(TD Greek Extended)    $(TD Old Persian))
564         $(TR $(TD Arabic Supplement) $(TD Gujarati)  $(TD Old South Arabian))
565         $(TR $(TD Armenian)  $(TD Gurmukhi)  $(TD Old Turkic))
566         $(TR $(TD Arrows)    $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition))
567         $(TR $(TD Avestan)   $(TD Hangul Compatibility Jamo) $(TD Oriya))
568         $(TR $(TD Balinese)  $(TD Hangul Jamo)   $(TD Osmanya))
569         $(TR $(TD Bamum) $(TD Hangul Jamo Extended-A)    $(TD Phags-pa))
570         $(TR $(TD Bamum Supplement)  $(TD Hangul Jamo Extended-B)    $(TD Phaistos Disc))
571         $(TR $(TD Basic Latin)   $(TD Hangul Syllables)  $(TD Phoenician))
572         $(TR $(TD Batak) $(TD Hanunoo)   $(TD Phonetic Extensions))
573         $(TR $(TD Bengali)   $(TD Hebrew)    $(TD Phonetic Extensions Supplement))
574         $(TR $(TD Block Elements)    $(TD High Private Use Surrogates)   $(TD Playing Cards))
575         $(TR $(TD Bopomofo)  $(TD High Surrogates)   $(TD Private Use Area))
576         $(TR $(TD Bopomofo Extended) $(TD Hiragana)  $(TD Rejang))
577         $(TR $(TD Box Drawing)   $(TD Ideographic Description Characters)    $(TD Rumi Numeral Symbols))
578         $(TR $(TD Brahmi)    $(TD Imperial Aramaic)  $(TD Runic))
579         $(TR $(TD Braille Patterns)  $(TD Inscriptional Pahlavi) $(TD Samaritan))
580         $(TR $(TD Buginese)  $(TD Inscriptional Parthian)    $(TD Saurashtra))
581         $(TR $(TD Buhid) $(TD IPA Extensions)    $(TD Sharada))
582         $(TR $(TD Byzantine Musical Symbols) $(TD Javanese)  $(TD Shavian))
583         $(TR $(TD Carian)    $(TD Kaithi)    $(TD Sinhala))
584         $(TR $(TD Chakma)    $(TD Kana Supplement)   $(TD Small Form Variants))
585         $(TR $(TD Cham)  $(TD Kanbun)    $(TD Sora Sompeng))
586         $(TR $(TD Cherokee)  $(TD Kangxi Radicals)   $(TD Spacing Modifier Letters))
587         $(TR $(TD CJK Compatibility) $(TD Kannada)   $(TD Specials))
588         $(TR $(TD CJK Compatibility Forms)   $(TD Katakana)  $(TD Sundanese))
589         $(TR $(TD CJK Compatibility Ideographs)  $(TD Katakana Phonetic Extensions)  $(TD Sundanese Supplement))
590         $(TR $(TD CJK Compatibility Ideographs Supplement)   $(TD Kayah Li)  $(TD Superscripts and Subscripts))
591         $(TR $(TD CJK Radicals Supplement)   $(TD Kharoshthi)    $(TD Supplemental Arrows-A))
592         $(TR $(TD CJK Strokes)   $(TD Khmer) $(TD Supplemental Arrows-B))
593         $(TR $(TD CJK Symbols and Punctuation)   $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators))
594         $(TR $(TD CJK Unified Ideographs)    $(TD Lao)   $(TD Supplemental Punctuation))
595         $(TR $(TD CJK Unified Ideographs Extension A)    $(TD Latin-1 Supplement)    $(TD Supplementary Private Use Area-A))
596         $(TR $(TD CJK Unified Ideographs Extension B)    $(TD Latin Extended-A)  $(TD Supplementary Private Use Area-B))
597         $(TR $(TD CJK Unified Ideographs Extension C)    $(TD Latin Extended Additional) $(TD Syloti Nagri))
598         $(TR $(TD CJK Unified Ideographs Extension D)    $(TD Latin Extended-B)  $(TD Syriac))
599         $(TR $(TD Combining Diacritical Marks)   $(TD Latin Extended-C)  $(TD Tagalog))
600         $(TR $(TD Combining Diacritical Marks for Symbols)   $(TD Latin Extended-D)  $(TD Tagbanwa))
601         $(TR $(TD Combining Diacritical Marks Supplement)    $(TD Lepcha)    $(TD Tags))
602         $(TR $(TD Combining Half Marks)  $(TD Letterlike Symbols)    $(TD Tai Le))
603         $(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham))
604         $(TR $(TD Control Pictures)  $(TD Linear B Ideograms)    $(TD Tai Viet))
605         $(TR $(TD Coptic)    $(TD Linear B Syllabary)    $(TD Tai Xuan Jing Symbols))
606         $(TR $(TD Counting Rod Numerals) $(TD Lisu)  $(TD Takri))
607         $(TR $(TD Cuneiform) $(TD Low Surrogates)    $(TD Tamil))
608         $(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian)    $(TD Telugu))
609         $(TR $(TD Currency Symbols)  $(TD Lydian)    $(TD Thaana))
610         $(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai))
611         $(TR $(TD Cyrillic)  $(TD Malayalam) $(TD Tibetan))
612         $(TR $(TD Cyrillic Extended-A)   $(TD Mandaic)   $(TD Tifinagh))
613         $(TR $(TD Cyrillic Extended-B)   $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols))
614         $(TR $(TD Cyrillic Supplement)   $(TD Mathematical Operators)    $(TD Ugaritic))
615         $(TR $(TD Deseret)   $(TD Meetei Mayek)  $(TD Unified Canadian Aboriginal Syllabics))
616         $(TR $(TD Devanagari)    $(TD Meetei Mayek Extensions)   $(TD Unified Canadian Aboriginal Syllabics Extended))
617         $(TR $(TD Devanagari Extended)   $(TD Meroitic Cursive)  $(TD Vai))
618         $(TR $(TD Dingbats)  $(TD Meroitic Hieroglyphs)  $(TD Variation Selectors))
619         $(TR $(TD Domino Tiles)  $(TD Miao)  $(TD Variation Selectors Supplement))
620         $(TR $(TD Egyptian Hieroglyphs)  $(TD Miscellaneous Mathematical Symbols-A)  $(TD Vedic Extensions))
621         $(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B)  $(TD Vertical Forms))
622         $(TR $(TD Enclosed Alphanumerics)    $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols))
623         $(TR $(TD Enclosed Alphanumeric Supplement)  $(TD Miscellaneous Symbols and Arrows)  $(TD Yi Radicals))
624         $(TR $(TD Enclosed CJK Letters and Months)   $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables))
625         $(TR $(TD Enclosed Ideographic Supplement)   $(TD Miscellaneous Technical)   )
626         $(TR $(TD Ethiopic)  $(TD Modifier Tone Letters) )
627     )
628     $(P Below is the table with script names accepted by $(LREF unicode.script)
629         and by the shorthand version $(LREF unicode):)
630         $(BOOKTABLE $(B Scripts),
631             $(TR $(TD Arabic)  $(TD Hanunoo) $(TD Old_Italic))
632             $(TR $(TD Armenian)    $(TD Hebrew)  $(TD Old_Persian))
633             $(TR $(TD Avestan) $(TD Hiragana)    $(TD Old_South_Arabian))
634             $(TR $(TD Balinese)    $(TD Imperial_Aramaic)    $(TD Old_Turkic))
635             $(TR $(TD Bamum)   $(TD Inherited)   $(TD Oriya))
636             $(TR $(TD Batak)   $(TD Inscriptional_Pahlavi)   $(TD Osmanya))
637             $(TR $(TD Bengali) $(TD Inscriptional_Parthian)  $(TD Phags_Pa))
638             $(TR $(TD Bopomofo)    $(TD Javanese)    $(TD Phoenician))
639             $(TR $(TD Brahmi)  $(TD Kaithi)  $(TD Rejang))
640             $(TR $(TD Braille) $(TD Kannada) $(TD Runic))
641             $(TR $(TD Buginese)    $(TD Katakana)    $(TD Samaritan))
642             $(TR $(TD Buhid)   $(TD Kayah_Li)    $(TD Saurashtra))
643             $(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi)  $(TD Sharada))
644             $(TR $(TD Carian)  $(TD Khmer)   $(TD Shavian))
645             $(TR $(TD Chakma)  $(TD Lao) $(TD Sinhala))
646             $(TR $(TD Cham)    $(TD Latin)   $(TD Sora_Sompeng))
647             $(TR $(TD Cherokee)    $(TD Lepcha)  $(TD Sundanese))
648             $(TR $(TD Common)  $(TD Limbu)   $(TD Syloti_Nagri))
649             $(TR $(TD Coptic)  $(TD Linear_B)    $(TD Syriac))
650             $(TR $(TD Cuneiform)   $(TD Lisu)    $(TD Tagalog))
651             $(TR $(TD Cypriot) $(TD Lycian)  $(TD Tagbanwa))
652             $(TR $(TD Cyrillic)    $(TD Lydian)  $(TD Tai_Le))
653             $(TR $(TD Deseret) $(TD Malayalam)   $(TD Tai_Tham))
654             $(TR $(TD Devanagari)  $(TD Mandaic) $(TD Tai_Viet))
655             $(TR $(TD Egyptian_Hieroglyphs)    $(TD Meetei_Mayek)    $(TD Takri))
656             $(TR $(TD Ethiopic)    $(TD Meroitic_Cursive)    $(TD Tamil))
657             $(TR $(TD Georgian)    $(TD Meroitic_Hieroglyphs)    $(TD Telugu))
658             $(TR $(TD Glagolitic)  $(TD Miao)    $(TD Thaana))
659             $(TR $(TD Gothic)  $(TD Mongolian)   $(TD Thai))
660             $(TR $(TD Greek)   $(TD Myanmar) $(TD Tibetan))
661             $(TR $(TD Gujarati)    $(TD New_Tai_Lue) $(TD Tifinagh))
662             $(TR $(TD Gurmukhi)    $(TD Nko) $(TD Ugaritic))
663             $(TR $(TD Han) $(TD Ogham)   $(TD Vai))
664             $(TR $(TD Hangul)  $(TD Ol_Chiki)    $(TD Yi))
665     )
666     $(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).)
667         $(BOOKTABLE $(B Hangul syllable type),
668             $(TR $(TH Abb.) $(TH Long form))
669             $(TR $(TD L)   $(TD Leading_Jamo))
670             $(TR $(TD LV)  $(TD LV_Syllable))
671             $(TR $(TD LVT) $(TD LVT_Syllable) )
672             $(TR $(TD T)   $(TD Trailing_Jamo))
673             $(TR $(TD V)   $(TD Vowel_Jamo))
674     )
675     References:
676         $(HTTP www.digitalmars.com/d/ascii-table.html, ASCII Table),
677         $(HTTP en.wikipedia.org/wiki/Unicode, Wikipedia),
678         $(HTTP www.unicode.org, The Unicode Consortium),
679         $(HTTP www.unicode.org/reports/tr15/, Unicode normalization forms),
        $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation),
        $(HTTP www.unicode.org/uni2book/ch05.pdf,
            Unicode Implementation Guidelines),
        $(HTTP www.unicode.org/uni2book/ch03.pdf,
            Unicode Conformance)
685     Trademarks:
686         Unicode(tm) is a trademark of Unicode, Inc.
687 
688     Copyright: Copyright 2013 -
689     License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
690     Authors:   Dmitry Olshansky
691     Source:    $(PHOBOSSRC std/uni/package.d)
692     Standards: $(HTTP www.unicode.org/versions/Unicode6.2.0/, Unicode v6.2)
693 
694 Macros:
695 
696 SECTION = <h3><a id="$1">$0</a></h3>
697 DEF = <div><a id="$1"><i>$0</i></a></div>
698 S_LINK = <a href="#$1">$+</a>
699 CODEPOINT = $(S_LINK Code point, code point)
700 CODEPOINTS = $(S_LINK Code point, code points)
701 CHARACTER = $(S_LINK Character, character)
702 CHARACTERS = $(S_LINK Character, characters)
703 CLUSTER = $(S_LINK Grapheme cluster, grapheme cluster)
704 +/
705 module std.uni;
706 
707 import std.meta : AliasSeq;
708 import std.range.primitives : back, ElementEncodingType, ElementType, empty,
709     front, hasLength, hasSlicing, isForwardRange, isInputRange,
710     isRandomAccessRange, popFront, put, save;
711 import std.traits : isConvertibleToString, isIntegral, isSomeChar,
712     isSomeString, Unqual, isDynamicArray;
713 // debug = std_uni;
714 
715 import std.internal.unicode_tables; // generated file
716 
717 debug(std_uni) import std.stdio; // writefln, writeln
718 
719 private:
720 
721 
// Copies `src` into `dest` element by element, starting at the last index and
// walking down to index 0. Both slices must have the same length.
// Because the highest index is written first, the copy stays correct when the
// two slices overlap and `dest` begins after `src` in memory.
void copyBackwards(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach_reverse (idx; 0 .. src.length)
        dest[idx] = src[idx];
}
728 
// Copies `src` into `dest` element by element, starting at index 0 and
// walking up to the last index. Both slices must have the same length.
// Because the lowest index is written first, the copy stays correct when the
// two slices overlap and `dest` begins before `src` in memory.
void copyForward(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach (idx; 0 .. src.length)
        dest[idx] = src[idx];
}
735 
// TODO: update to reflect all major CPUs supporting unaligned reads
// `hasUnalignedReads` is consulted by PackedPtrImpl when it decides whether
// half-word and quarter-word sized elements may be accessed through a
// narrower pointer type instead of shift-and-mask on whole words.
// Targets not listed here conservatively get `false`.
version (X86)
    enum hasUnalignedReads = true;
else version (X86_64)
    enum hasUnalignedReads = true;
else version (SystemZ)
    enum hasUnalignedReads = true;
else
    enum hasUnalignedReads = false; // better be safe than sorry

public enum dchar lineSep = '\u2028'; /// Constant $(CODEPOINT) (0x2028) - line separator.
public enum dchar paraSep = '\u2029'; /// Constant $(CODEPOINT) (0x2029) - paragraph separator.
public enum dchar nelSep  = '\u0085'; /// Constant $(CODEPOINT) (0x0085) - next line.
749 
// test the intro example
// Exercises, in order: set construction and set algebra, membership queries,
// trie construction at three depths, binding a set as a delegate, and
// normalization to NFC / NFD / NFKD.
@safe unittest
{
    import std.algorithm.searching : find;
    // initialize code point sets using script/block or property name
    // set contains code points from both scripts.
    auto set = unicode("Cyrillic") | unicode("Armenian");
    // or simpler and statically-checked look
    auto ascii = unicode.ASCII;
    auto currency = unicode.Currency_Symbol;

    // easy set ops
    auto a = set & ascii;
    assert(a.empty); // as it has no intersection with ascii
    a = set | ascii;
    auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian

    // some properties of code point sets
    assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
    // testing presence of a code point in a set
    // is just fine, it is O(logN)
    assert(!b['$']);
    assert(!b['\u058F']); // Armenian dram sign
    assert(b['¥']);

    // building fast lookup tables, these guarantee O(1) complexity
    // a 1-level Trie lookup table is essentially a huge bit-set, ~262Kb
    auto oneTrie = toTrie!1(b);
    // 2-level far more compact but typically slightly slower
    auto twoTrie = toTrie!2(b);
    // 3-level even smaller, and a bit slower yet
    auto threeTrie = toTrie!3(b);
    assert(oneTrie['£']);
    assert(twoTrie['£']);
    assert(threeTrie['£']);

    // build the trie with the most sensible trie level
    // and bind it as a functor
    auto cyrillicOrArmenian = toDelegate(set);
    auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
    assert(balance == "ընկեր!");
    // compatible with bool delegate(dchar)
    bool delegate(dchar) bindIt = cyrillicOrArmenian;

    // Normalization
    string s = "Plain ascii (and not only), is always normalized!";
    assert(s is normalize(s));// is the same string

    string nonS = "A\u0308ffin"; // 'A' followed by U+0308 (combining diaeresis), i.e. not composed
    auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
    assert(nS == "Äffin");
    assert(nS != nonS);
    string composed = "Äffin";

    assert(normalize!NFD(composed) == "A\u0308ffin");
    // to NFKD, compatibility decomposition useful for fuzzy matching/searching
    assert(normalize!NFKD("2¹⁰") == "210");
}
808 
// Numeric value of the last code point, 0x10FFFF (the same value as `dchar.max`).
enum lastDchar = 0x10FFFF;
810 
// Converts `from` to the integral type `T`, which differs from `F`.
// The value is asserted to lie within [T.min, T.max] before the cast, so in
// builds with assertions enabled an out-of-range value is caught rather than
// silently truncated.
auto force(T, F)(F from)
if (isIntegral!T && !is(T == F))
{
    assert(from >= T.min && from <= T.max);
    T converted = cast(T) from;
    return converted;
}
817 
// Converts `from` to the BitPacked type `T`, which differs from `F`.
// The value is asserted to fit into `bitSizeOf!T` bits, then cast to the
// underlying representation type and wrapped in `T`.
auto force(T, F)(F from)
if (isBitPacked!T && !is(T == F))
{
    alias Raw = TypeOfBitPacked!T;
    assert(from <= 2^^bitSizeOf!T-1);
    auto raw = cast(Raw) from;
    return T(raw);
}
824 
// Identity overload: source and target types are the same, so the value is
// returned unchanged and no range check is needed.
auto force(T, F)(F from)
if (is(T == F))
{
    return from;
}
830 
/*
    Repeats the bit pattern held in `val` `times` times, assuming its length
    is `bits` bits, and returns the concatenation in the low `times*bits` bits
    of the result.

    Params:
        times = how many copies of the pattern to produce (compile-time)
        bits = width of the pattern in bits (compile-time)
        val = the pattern; only its low `bits` bits are expected to be set

    Returns: the replicated pattern as a `size_t`.
*/
size_t replicateBits(size_t times, size_t bits)(size_t val) @safe pure nothrow @nogc
{
    static if (times == 1)
        return val;
    else static if (bits == 1)
    {
        // a single bit replicated `times` times is either all ones or all zeros
        static if (times == size_t.sizeof*8)
            return val ? size_t.max : 0; // a full-width shift would be invalid
        else
            // Shift a size_t rather than an int literal: with an int the shift
            // is limited to 31 positions, which breaks times >= 32 on 64-bit.
            return val ? (cast(size_t) 1 << times) - 1 : 0;
    }
    else static if (times % 2)
        // odd count: peel off one copy, replicate the rest
        return (replicateBits!(times-1, bits)(val)<<bits) | val;
    else
        // even count: double the pattern and halve the number of repetitions
        return replicateBits!(times/2, bits*2)((val << bits) | val);
}
848 
// Checks replicateBits against closed-form expectations for 1 to 10 repetitions.
@safe pure nothrow @nogc unittest // for replicate
{
    import std.algorithm.iteration : sum, map;
    import std.range : iota;
    size_t m = 0b111;
    size_t m2 = 0b01;
    static foreach (i; AliasSeq!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    {
        // i copies of 0b111 form a run of 3*i ones, i.e. 2^^(3*i) - 1
        assert(replicateBits!(i, 3)(m)+1 == (1<<(3*i)));
        // i copies of 0b01 set every even bit below 2*i
        assert(replicateBits!(i, 2)(m2) == iota(0, i).map!"2^^(2*a)"().sum());
    }
}
861 
// Multiple arrays squashed into one memory block.
//
// `Types` lists the element type of each sub-array ("level"). All levels live
// back to back in a single `size_t[]` (`storage`); `offsets[i]` is the index of
// the first word of level i and `sz[i]` is its element count. Elements are
// bit-packed, the number of words needed for a level is given by `spaceFor`.
struct MultiArray(Types...)
{
    import std.range.primitives : isOutputRange;

    // Allocates zero-initialized storage for one level per entry of `Types`;
    // `sizes[i]` is the element count of level i.
    this(size_t[] sizes...) @safe pure nothrow
    {
        assert(dim == sizes.length);
        size_t full_size;
        foreach (i, v; Types)
        {
            full_size += spaceFor!(bitSizeOf!v)(sizes[i]);
            sz[i] = sizes[i];
            // each level starts right after the words of the previous one
            static if (i >= 1)
                offsets[i] = offsets[i-1] +
                    spaceFor!(bitSizeOf!(Types[i-1]))(sizes[i-1]);
        }

        storage = new size_t[full_size];
    }

    // Wraps already laid out data (offsets, sizes and storage words) without
    // copying the storage; the resulting object is const.
    this(const(size_t)[] raw_offsets,
        const(size_t)[] raw_sizes,
        return scope const(size_t)[] data) return scope const @safe pure nothrow @nogc
    {
        offsets[] = raw_offsets[];
        sz[] = raw_sizes[];
        storage = data;
    }

    // Bounds-carrying packed view over level `n`.
    @property auto slice(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return packedArrayView!(Types[n])(ptr, sz[n]);
    }

    // Packed pointer to the start of level `n` (no length information).
    @property auto ptr(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return inout(PackedPtr!(Types[n]))(ptr);
    }

    template length(size_t n)
    {
        // Number of elements in level `n`.
        @property size_t length()const @safe pure nothrow @nogc{ return sz[n]; }

        // Resizes level `n` to `new_size` elements, moving the words of all
        // following levels so that they stay contiguous, and keeping their
        // contents intact.
        @property void length(size_t new_size)
        {
            if (new_size > sz[n])
            {// extend
                size_t delta = (new_size - sz[n]);
                sz[n] += delta;
                delta = spaceFor!(bitSizeOf!(Types[n]))(delta);
                storage.length +=  delta;// extend space at end
                // raw_slice!x must follow resize as it could be moved!
                // next stmts move all data past this array, last-one-goes-first
                static if (n != dim-1)
                {
                    auto start = raw_ptr!(n+1);
                    // len includes delta
                    size_t len = (storage.ptr+storage.length-start);

                    copyBackwards(start[0 .. len-delta], start[delta .. len]);

                    start[0 .. delta] = 0;
                    // offsets are used for raw_slice, ptr etc.
                    foreach (i; n+1 .. dim)
                        offsets[i] += delta;
                }
            }
            else if (new_size < sz[n])
            {// shrink
                // Words that become free. This has to be the difference of the
                // two word counts: spaceFor(element difference) may round up to
                // a word that still holds live elements of this level.
                immutable delta = spaceFor!(bitSizeOf!(Types[n]))(sz[n])
                    - spaceFor!(bitSizeOf!(Types[n]))(new_size);
                sz[n] = new_size;
                // move all data past this array down into the freed gap;
                // the destination lies before the source, hence forward copy
                static if (n != dim-1)
                {
                    if (delta != 0)
                    {
                        immutable tail = offsets[n+1];
                        copyForward(storage[tail .. $],
                            storage[tail - delta .. $ - delta]);

                        // adjust offsets last, they affect raw_slice
                        foreach (i; n+1 .. dim)
                            offsets[i] -= delta;
                    }
                }
                storage.length -= delta;
            }
            // else - NOP
        }
    }

    // Size in bytes of level `n`, or of the whole storage when `n` is omitted.
    @property size_t bytes(size_t n=size_t.max)() const @safe
    {
        static if (n == size_t.max)
            return storage.length*size_t.sizeof;
        else static if (n != Types.length-1)
            return (raw_ptr!(n+1)-raw_ptr!n)*size_t.sizeof;
        else
            return (storage.ptr+storage.length - raw_ptr!n)*size_t.sizeof;
    }

    // Writes offsets, sizes and storage to `sink` as three comma-separated
    // bracketed lists of hexadecimal numbers.
    void store(OutRange)(scope OutRange sink) const
        if (isOutputRange!(OutRange, char))
    {
        import std.format.write : formattedWrite;
        formattedWrite(sink, "[%( 0x%x, %)]", offsets[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", sz[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", storage);
    }

private:
    import std.meta : staticMap;
    // Pointer to the first storage word of level `n`.
    @property auto raw_ptr(size_t n)()inout pure nothrow @nogc
    {
        static if (n == 0)
            return storage.ptr;
        else
        {
            return storage.ptr+offsets[n];
        }
    }
    enum dim = Types.length;
    size_t[dim] offsets;// offset for level x
    size_t[dim] sz;// size of level x
    alias bitWidth = staticMap!(bitSizeOf, Types);
    size_t[] storage;
}
989 
// Resizes the levels of a MultiArray in various orders and verifies that the
// contents written to each level are still readable afterwards. The whole
// scenario is a lambda so that it runs both at compile time and at run time.
@system unittest
{
    import std.conv : text;
    enum dg = (){
        // sizes are:
        // lvl0: 3, lvl1 : 2, lvl2: 1
        auto m = MultiArray!(int, ubyte, int)(3,2,1);

        // expects level k to hold 1, 2, ..., n
        static void check(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == i+1, text("level:",i," : ",m.slice!(k)[0 .. n]));
        }

        // expects level k to hold n, n-1, ..., 1
        static void checkB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == n-i, text("level:",i," : ",m.slice!(k)[0 .. n]));
        }

        // writes 1, 2, ..., n into level k
        static void fill(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(i+1);
        }

        // writes n, n-1, ..., 1 into level k
        static void fillB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(n-i);
        }

        m.length!1 = 100;
        fill!1(m, 100);
        check!1(m, 100);

        // growing an earlier level must not disturb level 1
        m.length!0 = 220;
        fill!0(m, 220);
        check!1(m, 100);
        check!0(m, 220);

        m.length!2 = 17;
        fillB!2(m, 17);
        checkB!2(m, 17);
        check!0(m, 220);
        check!1(m, 100);

        // growing the last level keeps its existing prefix
        m.length!2 = 33;
        checkB!2(m, 17);
        fillB!2(m, 33);
        checkB!2(m, 33);
        check!0(m, 220);
        check!1(m, 100);

        m.length!1 = 195;
        fillB!1(m, 195);
        checkB!1(m, 195);
        checkB!2(m, 33);
        check!0(m, 220);

        // bit-packed levels: shrink the first, grow the second, then refill both
        auto marr = MultiArray!(BitPacked!(uint, 4), BitPacked!(uint, 6))(20, 10);
        marr.length!0 = 15;
        marr.length!1 = 30;
        fill!1(marr, 30);
        fill!0(marr, 15);
        check!1(marr, 30);
        check!0(marr, 15);
        return 0;
    };
    enum ct = dg();
    auto rt = dg();
}
1062 
// Five-level MultiArray with differently sized bit-packed levels: fills every
// level with its own indices and verifies that writes to one level never
// leak into another.
@system unittest
{// more bitpacking tests
    import std.conv : text;

    alias Bitty =
      MultiArray!(BitPacked!(size_t, 3)
                , BitPacked!(size_t, 4)
                , BitPacked!(size_t, 3)
                , BitPacked!(size_t, 6)
                , bool);
    // bit-field extractors splitting a 16-bit index into 3 + 4 + 3 + 6 bits,
    // matching the widths of levels 0 .. 3
    alias fn1 = sliceBits!(13, 16);
    alias fn2 = sliceBits!( 9, 13);
    alias fn3 = sliceBits!( 6,  9);
    alias fn4 = sliceBits!( 0,  6);
    // expects level lvl to hold 0, 1, 2, ...
    static void check(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            assert(arr.slice!(lvl)[i] == i, text("Mismatch on lvl ", lvl, " idx ", i, " value: ", arr.slice!(lvl)[i]));
    }

    // writes 0, 1, 2, ... into level lvl
    static void fillIdx(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            arr.slice!(lvl)[i] = i;
    }
    Bitty m1;

    // size the levels from last to first, each exactly as large as its bit width allows
    m1.length!4 = 10;
    m1.length!3 = 2^^6;
    m1.length!2 = 2^^3;
    m1.length!1 = 2^^4;
    m1.length!0 = 2^^3;

    m1.length!4 = 2^^16;

    for (size_t i = 0; i< m1.length!4; i++)
        m1.slice!(4)[i] = i % 2;

    fillIdx!1(m1);
    check!1(m1);
    fillIdx!2(m1);
    check!2(m1);
    fillIdx!3(m1);
    check!3(m1);
    fillIdx!0(m1);
    check!0(m1);
    // re-check the earlier levels after all of them were written
    check!3(m1);
    check!2(m1);
    check!1(m1);
    for (size_t i=0; i < 2^^16; i++)
    {
        m1.slice!(4)[i] = i % 2;
        m1.slice!(0)[fn1(i)] = fn1(i);
        m1.slice!(1)[fn2(i)] = fn2(i);
        m1.slice!(2)[fn3(i)] = fn3(i);
        m1.slice!(3)[fn4(i)] = fn4(i);
    }
    for (size_t i=0; i < 2^^16; i++)
    {
        assert(m1.slice!(4)[i] == i % 2);
        assert(m1.slice!(0)[fn1(i)] == fn1(i));
        assert(m1.slice!(1)[fn2(i)] == fn2(i));
        assert(m1.slice!(2)[fn3(i)] == fn3(i));
        assert(m1.slice!(3)[fn4(i)] == fn4(i));
    }
}
1127 
// Number of `size_t` words needed to store `new_len` elements of `_bits` bits
// each. The element width is first rounded up to a power of two (the same
// rounding PackedArrayView applies); elements narrower than a word share
// words, elements wider than a word take a whole number of words each.
size_t spaceFor(size_t _bits)(size_t new_len) @safe pure nothrow @nogc
{
    import std.math.algebraic : nextPow2;
    enum wordBits = 8*size_t.sizeof;
    enum bits = _bits == 1 ? 1 : nextPow2(_bits - 1);// see PackedArrayView
    static if (bits <= wordBits)
    {
        // several elements per word: round the word count up
        enum perWord = wordBits/bits;
        return (new_len+perWord-1)/perWord;
    }
    else
    {
        // several words per element
        static assert(bits % wordBits == 0);
        return new_len * bits/wordBits;
    }
}
1143 
// True for types that can be stored as packed elements: BitPacked wrappers,
// integral types, `bool` and character types.
enum isBitPackableType(T) = isIntegral!T || isSomeChar!T
    || is(T == bool) || isBitPacked!T;
1149 
1150 //============================================================================
// Selects the PackedArrayViewImpl instantiation for element type `T`.
// The bit size of `T` is rounded up to a power of two so that a whole number
// of elements fits into each machine word; 1-bit types stay at 1 bit.
template PackedArrayView(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math.algebraic : nextPow2;
    private enum rawBits = bitSizeOf!T;
    private enum slotBits = rawBits > 1 ? nextPow2(rawBits - 1) : 1;
    alias PackedArrayView = PackedArrayViewImpl!(T, slotBits);
}
1159 
// Unsafe and fast access to a chunk of RAM as if it contained packed values.
// Selects the PackedPtrImpl instantiation for element type `T`, rounding the
// bit size of `T` up to a power of two (1-bit types stay at 1 bit).
template PackedPtr(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math.algebraic : nextPow2;
    private enum rawBits = bitSizeOf!T;
    private enum slotBits = rawBits > 1 ? nextPow2(rawBits - 1) : 1;
    alias PackedPtr = PackedPtrImpl!(T, slotBits);
}
1169 
// Unchecked accessor treating a block of `size_t` words as a sequence of
// `bits`-wide elements of type `T`. It stores only the base pointer: there is
// no length and no bounds checking.
struct PackedPtrImpl(T, size_t bits)
{
pure nothrow:
    static assert(isPow2OrZero(bits));

    this(inout(size_t)* ptr)inout @safe @nogc
    {
        origin = ptr;
    }

    // Generic read: locate the word (q) and the slot inside it (r),
    // then shift the slot down and mask it out.
    private T simpleIndex(size_t n) inout
    {
        immutable q = n / factor;
        immutable r = n % factor;
        return cast(T)((origin[q] >> bits*r) & mask);
    }

    // Generic write: clear the target slot in the word, then OR in the value.
    // The contract only verifies that the value fits when T is integral.
    private void simpleWrite(TypeOfBitPacked!T val, size_t n)
    in
    {
        static if (isIntegral!T)
            assert(val <= mask);
    }
    do
    {
        immutable q = n / factor;
        immutable r = n % factor;
        immutable tgt_shift = bits*r;
        immutable word = origin[q];
        origin[q] = (word & ~(mask << tgt_shift))
            | (cast(size_t) val << tgt_shift);
    }

    // Fast path: the element width matches a native integer type `U`, so the
    // storage can be reinterpreted as a `U` array and indexed directly.
    static if (factor == bytesPerWord// can safely pack by byte
         || factor == 1 // a whole word at a time
         || ((factor == bytesPerWord/2 || factor == bytesPerWord/4)
                && hasUnalignedReads)) // this needs unaligned reads
    {
        static if (factor == bytesPerWord)
            alias U = ubyte;
        else static if (factor == bytesPerWord/2)
            alias U = ushort;
        else static if (factor == bytesPerWord/4)
            alias U = uint;
        else static if (size_t.sizeof == 8 && factor == bytesPerWord/8)
            alias U = ulong;

        // Direct typed read on little-endian targets at run time; CTFE and
        // other byte orders go through the shift-and-mask routine.
        T opIndex(size_t idx) inout
        {
            T ret;
            version (LittleEndian)
                ret = __ctfe ? simpleIndex(idx) :
                    cast(inout(T))(cast(U*) origin)[idx];
            else
                ret = simpleIndex(idx);
            return ret;
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        // Direct typed write on little-endian targets at run time; CTFE and
        // other byte orders go through the shift-and-mask routine.
        void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
        {
            version (LittleEndian)
            {
                if (__ctfe)
                    simpleWrite(val, idx);
                else
                    (cast(U*) origin)[idx] = cast(U) val;
            }
            else
                simpleWrite(val, idx);
        }
    }
    else
    {
        // Slow path: every access uses shift-and-mask on whole words.
        T opIndex(size_t n) inout
        {
            return simpleIndex(n);
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        void opIndexAssign(TypeOfBitPacked!T val, size_t n)
        {
            return simpleWrite(val, n);
        }
    }

private:
    // factor - number of elements in one machine word
    // mask - the low `bits` bits set, selects a single element
    enum factor = size_t.sizeof*8/bits, mask = 2^^bits-1;
    enum bytesPerWord =  size_t.sizeof;
    // first word of the packed data; not owned by this struct
    size_t* origin;
}
1276 
// Data is packed only by power-of-two sized packs per word,
// thus avoiding mul/div overhead at the cost of ultimate packing.
// This construct doesn't own memory, it only provides access; see MultiArray
// for usage.
//
// A view covers `limit` elements starting `ofs` elements past the packed
// pointer `ptr`. All indices taken by the public operations are relative to
// the start of the view.
struct PackedArrayViewImpl(T, size_t bits)
{
pure nothrow:

    // Creates a view of `items` elements that starts `offset` elements past
    // `origin`.
    this(inout(size_t)* origin, size_t offset, size_t items) inout @safe
    {
        ptr = inout(PackedPtr!(T))(origin);
        ofs = offset;
        limit = items;
    }

    // Returns true when every element in the view-relative range [s, e)
    // is zero.
    bool zeros(size_t s, size_t e)
    in
    {
        assert(s <= e);
    }
    do
    {
        s += ofs;
        e += ofs;
        immutable pad_s = roundUp(s);
        // The range ends before the next word boundary: test it element by
        // element. Comparing the rounded-up start (not `s` itself) matters,
        // otherwise the head loop below would scan past `e`.
        if (pad_s >= e)
        {
            foreach (i; s .. e)
                if (ptr[i])
                    return false;
            return true;
        }
        immutable pad_e = roundDown(e);
        size_t i;
        // unaligned head
        for (i=s; i<pad_s; i++)
            if (ptr[i])
                return false;
        // all in between is x*factor elements
        for (size_t j=i/factor; i<pad_e; i+=factor, j++)
            if (ptr.origin[j])
                return false;
        // unaligned tail
        for (; i<e; i++)
            if (ptr[i])
                return false;
        return true;
    }

    // Reads element `idx` of the view.
    T opIndex(size_t idx) inout
    in
    {
        assert(idx < limit);
    }
    do
    {
        return ptr[ofs + idx];
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversion
    {
        void opIndexAssign(T val, size_t idx)
        {
            return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
        }
    }

    // Writes element `idx` of the view.
    void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
    in
    {
        assert(idx < limit);
    }
    do
    {
        ptr[ofs + idx] = val;
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversions
    {
        void opSliceAssign(T val, size_t start, size_t end)
        {
            opSliceAssign(cast(TypeOfBitPacked!T) val, start, end);
        }
    }

    // Assigns `val` to every element in the view-relative range [start, end).
    void opSliceAssign(TypeOfBitPacked!T val, size_t start, size_t end)
    in
    {
        assert(start <= end);
        assert(end <= limit);
    }
    do
    {
        // account for an offset view
        start += ofs;
        end += ofs;
        // rounded to factor granularity
        immutable pad_start = roundUp(start);// rounded up
        if (pad_start >= end) // rounded-up start is at or past the end of slice
        {
            //nothing to gain, use per element assignment
            foreach (i; start .. end)
                ptr[i] = val;
            return;
        }
        immutable pad_end = roundDown(end); // rounded down
        size_t i;
        for (i=start; i<pad_start; i++)
            ptr[i] = val;
        // all in between is x*factor elements
        if (pad_start != pad_end)
        {
            immutable repval = replicateBits!(factor, bits)(val);
            for (size_t j=i/factor; i<pad_end; i+=factor, j++)
                ptr.origin[j] = repval;// so speed it up by factor
        }
        for (; i<end; i++)
            ptr[i] = val;
    }

    // Sub-view over the view-relative range [from, to).
    auto opSlice(size_t from, size_t to)inout
    in
    {
        assert(from <= to);
        // `to` and `limit` are both relative to the start of this view,
        // so the view's own offset must not enter the comparison
        assert(to <= limit);
    }
    do
    {
        return typeof(this)(ptr.origin, ofs + from, to - from);
    }

    auto opSlice(){ return opSlice(0, length); }

    // Element-wise equality with another view. When both views start on a
    // word boundary and span whole words, the underlying words are compared
    // directly.
    bool opEquals(T)(auto ref T arr) const
    {
        if (limit != arr.limit)
           return false;
        size_t s1 = ofs, s2 = arr.ofs;
        size_t e1 = s1 + limit, e2 = s2 + limit;
        if (s1 % factor == 0 && s2 % factor == 0 && length % factor == 0)
        {
            return ptr.origin[s1/factor .. e1/factor]
                == arr.ptr.origin[s2/factor .. e2/factor];
        }
        for (size_t i=0;i<limit; i++)
            if (this[i] != arr[i])
                return false;
        return true;
    }

    @property size_t length()const{ return limit; }

private:
    // round an element index up / down to a multiple of `factor`
    auto roundUp()(size_t val){ return (val+factor-1)/factor*factor; }
    auto roundDown()(size_t val){ return val/factor*factor; }
    // factor - number of elements in one machine word
    enum factor = size_t.sizeof*8/bits;
    PackedPtr!(T) ptr;
    size_t ofs, limit;
}
1434 
1435 
// Random-access range over the index window [from, to) of an indexable
// object that is referenced through a pointer. Element access is forwarded
// to the pointee, so anything offering opIndex can be traversed as a range.
private struct SliceOverIndexed(T)
{
    // whether the underlying type accepts element / slice assignment
    enum assignableIndex = is(typeof((){ T.init[0] = Item.init; }));
    enum assignableSlice = is(typeof((){ T.init[0 .. 0] = Item.init; }));

    // Element `idx`, counted from the start of the window.
    auto opIndex(size_t idx)const
    in
    {
        assert(idx < to - from);
    }
    do
    {
        return (*arr)[from+idx];
    }

    static if (assignableIndex)
    void opIndexAssign(Item val, size_t idx)
    in
    {
        assert(idx < to - from);
    }
    do
    {
       (*arr)[from+idx] = val;
    }

    // Sub-window [a, b), relative to the current window.
    auto opSlice(size_t a, size_t b)
    {
        immutable lo = from+a;
        immutable hi = from+b;
        return typeof(this)(lo, hi, arr);
    }

    // static if (assignableSlice)
    void opSliceAssign(V)(V val, size_t start, size_t end)
    {
        (*arr)[start+from .. end+from] = val;
    }

    // Copy of the whole window.
    auto opSlice()
    {
        return typeof(this)(from, to, arr);
    }

    @property size_t length()const { return to-from; }

    alias opDollar = length;

    @property bool empty()const { return to == from; }

    @property auto front()const { return (*arr)[from]; }

    static if (assignableIndex)
    @property void front(Item val) { (*arr)[from] = val; }

    @property auto back()const { return (*arr)[to-1]; }

    static if (assignableIndex)
    @property void back(Item val) { (*arr)[to-1] = val; }

    @property auto save() inout { return this; }

    void popFront() { ++from; }

    void popBack() { --to; }

    // Equal when the lengths match and all elements compare equal pairwise.
    bool opEquals(R)(auto ref R arr) const
    {
        immutable len = length;
        if (len != arr.length)
            return false;
        foreach (i; 0 .. len)
            if (this[i] != arr[i])
                return false;
        return true;
    }
private:
    alias Item = typeof(T.init[0]);
    size_t from, to;
    T* arr;
}
1513 
@safe pure nothrow @nogc unittest
{
    // compile-time check that the wrapper models a random-access range
    static assert(isRandomAccessRange!(SliceOverIndexed!(int[])));
}
1518 
// Builds a read-only window [a, b) over the container pointed to by `x`.
// Overload for pointers to const containers; `T` itself must be unqualified.
SliceOverIndexed!(const(T)) sliceOverIndexed(T)(size_t a, size_t b, const(T)* x)
if (is(Unqual!T == T))
{
    return SliceOverIndexed!(const(T))(a, b, x);
}
1524 
// BUG? inout is out of reach
//...SliceOverIndexed.arr only parameters or stack based variables can be inout
// Builds a window [a, b) over the mutable container pointed to by `x`
// (separate overload because `inout` cannot be used here, see above).
SliceOverIndexed!T sliceOverIndexed(T)(size_t a, size_t b, T* x)
if (is(Unqual!T == T))
{
    return SliceOverIndexed!T(a, b, x);
}
1532 
@system unittest
{
    int[] idxArray = [2, 3, 5, 8, 13];
    auto sliced = sliceOverIndexed(0, idxArray.length, &idxArray);

    // front/back access and assignment write through to idxArray
    assert(!sliced.empty);
    assert(sliced.front == 2);
    sliced.front = 1;
    assert(sliced.front == 1);
    assert(sliced.back == 13);
    sliced.popFront();
    assert(sliced.front == 3);
    assert(sliced.back == 13);
    sliced.back = 11;
    assert(sliced.back == 11);
    sliced.popBack();

    // indexing, $ and re-slicing are relative to the current window
    assert(sliced.front == 3);
    assert(sliced[$-1] == 8);
    sliced = sliced[];
    assert(sliced[0] == 3);
    assert(sliced.back == 8);
    sliced = sliced[1..$];
    assert(sliced.front == 5);
    sliced = sliced[0..$-1];
    assert(sliced[$-1] == 5);

    // equality compares contents, not the underlying container
    int[] other = [2, 5];
    assert(sliced[] == sliceOverIndexed(1, 2, &other));
    // slice assignment fills the addressed part of the container
    sliceOverIndexed(0, 2, &idxArray)[0 .. 2] = -1;
    assert(idxArray[0 .. 2] == [-1, -1]);
    // an empty window over a null array (previously `nullArr` was declared but
    // never used and the slice was taken over idxArray instead)
    uint[] nullArr = null;
    auto nullSlice = sliceOverIndexed(0, 0, &nullArr);
    assert(nullSlice.empty);
    assert(nullSlice.length == 0);
}
1568 
// Convenience constructor: a PackedArrayView!T over `ptr` starting at offset 0
// and holding `items` elements; `inout` carries the pointer's qualifier to the view.
private inout(PackedArrayView!T) packedArrayView(T)(inout(size_t)* ptr, size_t items)
{
    return inout(PackedArrayView!T)(ptr, 0, items);
}
1573 
1574 
1575 //============================================================================
1576 // Partially unrolled binary search using Shar's method
1577 //============================================================================
1578 
/*
    Generates, as a string for `mixin`, the unrolled tail of a uniform binary
    search (Shar's method) for step sizes strictly below `size`.

    The generated code expects these names in the scope it is mixed into:
    `range`, `needle`, `pred`, `idx` (current position) and `m` (current step,
    a power of two smaller than `size`, or zero). It is a `switch` over
    `bsr(m)+1` whose cases fall through from the largest step down to `case 0`,
    each probing `range[idx+step]` and advancing `idx` on success.

    Params:
        size = exclusive upper bound of the step; must be a power of two (or zero)
    Returns:
        D source code of the unrolled search
*/
string genUnrolledSwitchSearch(size_t size) @safe pure nothrow
{
    import core.bitop : bsr;
    import std.array : replace;
    import std.conv : to;
    assert(isPow2OrZero(size));
    // `bsr(0)` is undefined, so a zero step (single-element range) is mapped
    // to `case 0` explicitly instead of being fed to `bsr`.
    string code = `
    import core.bitop : bsr;
    auto power = m ? bsr(m)+1 : 0;
    switch (power){`;
    // same reason: never call bsr on a zero `size`
    immutable int levels = size ? bsr(size) : 0;
    foreach_reverse (val; 0 .. levels)
    {
        // `case val+1` handles the step 2^^val and falls through to the next smaller one
        code ~= `
        case pow:
            if (pred(range[idx+m], needle))
                idx +=  m;
            goto case;
        `.replace("m", to!string(size_t(1) << val))
        .replace("pow", to!string(val + 1));
    }
    code ~= `
        case 0:
            if (pred(range[idx], needle))
                idx += 1;
            goto default;
        `;
    code ~= `
        default:
    }`;
    return code;
}

// True if `sz` is zero or a power of two (at most one bit set).
bool isPow2OrZero(size_t sz) @safe pure nothrow @nogc
{
    // See also: std.math.isPowerOf2()
    return (sz & (sz-1)) == 0;
}
1619 
/*
    Uniform binary search (lower bound) over a range whose length is a power of two.

    Returns the number of leading elements `e` for which `pred(e, needle)` holds.
    `range[0]` is read unconditionally, so callers must pass a non-empty range.
*/
size_t uniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    size_t idx = 0;
    // halve the step each round, advancing whenever the probed element passes
    for (size_t m = range.length/2; m != 0; m /= 2)
    {
        if (pred(range[idx+m], needle))
            idx += m;
    }
    // the last probe decides whether the element at `idx` itself is below the bound
    return pred(range[idx], needle) ? idx + 1 : idx;
}
1635 
// Same contract as a uniform power-of-two lower-bound search, but steps below
// `max` are handled by the switch produced by genUnrolledSwitchSearch.
// `range[idx]` is read unconditionally by the generated code, so the range
// must not be empty.
size_t switchUniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    // NOTE: the names `idx`, `m`, `range`, `needle` and `pred` are referenced
    // by the mixed-in code below and must not be renamed.
    size_t idx = 0, m = range.length/2;
    enum max = 1 << 10;
    // plain loop for the large steps the generated switch does not cover
    while (m >= max)
    {
        if (pred(range[idx+m], needle))
            idx += m;
        m /= 2;
    }
    // unrolled remainder: steps below `max` plus the final probe of range[idx]
    mixin(genUnrolledSwitchSearch(max));
    return idx;
}
1651 
/*
    Lower-bound search for ranges of arbitrary length (Shar's method), built on
    `uniLowerBound`, a search that only handles power-of-two lengths.

    A non-power-of-two range is covered by two overlapping power-of-two windows:
    the prefix `range[0 .. n]` (n = largest power of two <= length) and a window
    at the end that covers the whole tail. One probe of `range[n-1]` selects
    which window to search.

    Params (of the inner function):
        _pred  = binary predicate, "a<b" by default
        range  = random-access range partitioned by `pred(e, needle)`
        needle = value to search for
    Returns:
        number of leading elements `e` for which `pred(e, needle)` holds
*/
template sharMethod(alias uniLowerBound)
{
    size_t sharMethod(alias _pred="a<b", Range, T)(Range range, T needle)
        if (is(T : ElementType!Range))
    {
        import std.functional : binaryFun;
        import std.math.algebraic : nextPow2, truncPow2;
        alias pred = binaryFun!_pred;
        if (range.length == 0)
            return 0;
        if (isPow2OrZero(range.length))
            return uniLowerBound!pred(range, needle);
        size_t n = truncPow2(range.length);
        if (pred(range[n-1], needle))
        {// search in another 2^^k area that fully covers the tail of range
            // nextPow2 yields a power of two strictly greater than its argument,
            // so k > length - n (the window reaches back to index n-1 or earlier)
            // while k <= n <= length. Passing `length - n + 1` instead would give
            // k > length for lengths of the form 2^^j - 1 and underflow `$-k`.
            size_t k = nextPow2(range.length - n);
            return range.length - k + uniLowerBound!pred(range[$-k..$], needle);
        }
        else
            return uniLowerBound!pred(range[0 .. n], needle);
    }
}
1674 
1675 alias sharLowerBound = sharMethod!uniformLowerBound;
1676 alias sharSwitchLowerBound = sharMethod!switchUniformLowerBound;
1677 
@safe unittest
{
    import std.array : array;
    import std.range : assumeSorted, iota;

    // reference implementation to compare against
    auto stdLowerBound(T)(T[] range, T needle)
    {
        return assumeSorted(range).lowerBound(needle).length;
    }
    immutable MAX = 5*1173;
    // multiples of 5: 5, 10, ..., MAX-5
    auto arr = array(iota(5, MAX, 5));
    assert(arr.length == MAX/5-1);
    // every needle from below the first element to beyond the last one
    foreach (i; 0 .. MAX+5)
    {
        auto st = stdLowerBound(arr, i);
        assert(st == sharLowerBound(arr, i));
        assert(st == sharSwitchLowerBound(arr, i));
    }
    // empty input
    arr = [];
    auto st = stdLowerBound(arr, 33);
    assert(st == sharLowerBound(arr, 33));
    assert(st == sharSwitchLowerBound(arr, 33));
}
1701 //============================================================================
1702 
1703 @safe
1704 {
// hope to see similar stuff in the public interface... once Allocators are out
1706 //@@@BUG moveFront and friends? dunno, for now it's POD-only
1707 
// Replaces dest[from .. to] with the contents of `stuff`, growing or shrinking
// `dest` as needed. With Policy == void the array is resized through `.length`,
// otherwise through Policy.realloc.
// Returns: the index just past the inserted data (from + stuff.length).
@trusted size_t genericReplace(Policy=void, T, Range)
    (ref T dest, size_t from, size_t to, Range stuff)
{
    import std.algorithm.mutation : copy;
    size_t delta = to - from;
    size_t stuff_end = from+stuff.length;
    if (stuff.length > delta)
    {// replace increases length
        delta = stuff.length - delta;// now, new is > old  by delta
        static if (is(Policy == void))
            dest.length = dest.length+delta;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length+delta);
        // shift the old tail right by delta first (source and target overlap,
        // hence the backwards copy), then write the new data into the gap
        copyBackwards(dest[to .. dest.length-delta],
            dest[to+delta .. dest.length]);
        copyForward(stuff, dest[from .. stuff_end]);
    }
    else if (stuff.length == delta)
    {
        // same size: overwrite in place
        copy(stuff, dest[from .. to]);
    }
    else
    {// replace decreases length by delta
        delta = delta - stuff.length;
        // write the new data, shift the tail left by delta, then drop the slack
        copy(stuff, dest[from .. stuff_end]);
        copyForward(dest[to .. dest.length],
            dest[stuff_end .. dest.length-delta]);
        static if (is(Policy == void))
            dest.length = dest.length - delta;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length-delta);
    }
    return stuff_end;
}
1742 
1743 
// Storage policy that keeps everything in ordinary GC-managed dynamic arrays.
@safe private struct GcPolicy
{
    import std.traits : isDynamicArray;

    // Mutable copy of `arr`.
    static T[] dup(T)(const T[] arr)
    {
        return arr.dup;
    }

    // New array of `size` default-initialized elements.
    static T[] alloc(T)(size_t size)
    {
        return new T[](size);
    }

    // Resizes `arr` to `sz` elements and returns the (possibly moved) array.
    static T[] realloc(T)(T[] arr, size_t sz)
    {
        arr.length = sz;
        return arr;
    }

    // Replaces dest[from .. to] with `stuff`.
    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        replaceInPlace(dest, from, to, stuff);
    }

    // Appends either a single value (converted with force!T) or a whole input range.
    static void append(T, V)(ref T[] arr, V value)
    {
        static if (isInputRange!V)
            insertInPlace(arr, arr.length, value);
        else
            arr ~= force!T(value);
    }

    // Drops the reference to the array; in debug builds the contents of
    // unqualified array types are overwritten with a marker value first.
    static void destroy(T)(ref T arr) pure // pure required for -dip25, inferred for -dip1000
        if (isDynamicArray!T)
    {
        static if (is(Unqual!T == T))
        {
            debug
            {
                arr[] = cast(typeof(T.init[0]))(0xdead_beef);
            }
        }
        arr = null;
    }
}
1798 
// Storage manipulation policy offering the same set of operations as GcPolicy
// (dup/alloc/realloc/replaceImpl/append/destroy), but the memory is obtained
// through std.internal.memory's enforceMalloc/enforceRealloc and released with
// core.memory.pureFree. Arrays handled by this policy must be released with
// `destroy`.
@safe struct ReallocPolicy
{
    import std.range.primitives : hasLength;

    // Returns a newly allocated copy of `arr`.
    static T[] dup(T)(const T[] arr)
    {
        auto result = alloc!T(arr.length);
        result[] = arr[];
        return result;
    }

    // Allocates storage for `size` elements; the contents are not initialized here.
    static T[] alloc(T)(size_t size) @trusted
    {
        import std.internal.memory : enforceMalloc;

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        // a byte count that does not fit in size_t is treated as fatal
        if (overflow) assert(0);

        auto ptr = cast(T*) enforceMalloc(nbytes);
        return ptr[0 .. size];
    }

    // Resizes `arr` to `size` elements and returns the new slice.
    // A size of zero releases the storage and returns null.
    static T[] realloc(T)(return scope T[] arr, size_t size) @trusted
    {
        import std.internal.memory : enforceRealloc;
        if (!size)
        {
            destroy(arr);
            return null;
        }

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        // a byte count that does not fit in size_t is treated as fatal
        if (overflow) assert(0);

        auto ptr = cast(T*) enforceRealloc(arr.ptr, nbytes);
        return ptr[0 .. size];
    }

    // Replaces dest[from .. to] with `stuff`, resizing through this policy.
    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        genericReplace!(ReallocPolicy)(dest, from, to, stuff);
    }

    // Appends a single value (converted with force!T).
    static void append(T, V)(ref T[] arr, V value)
        if (!isInputRange!V)
    {
        // length + 1 must not wrap around
        if (arr.length == size_t.max) assert(0);
        arr = realloc(arr, arr.length+1);
        arr[$-1] = force!T(value);
    }

    pure @safe unittest
    {
        int[] arr;
        ReallocPolicy.append(arr, 3);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [3]));
    }

    // Appends all elements of an input range with a known length.
    static void append(T, V)(ref T[] arr, V value)
        if (isInputRange!V && hasLength!V)
    {
        import core.checkedint : addu;
        bool overflow;
        size_t nelems = addu(arr.length, value.length, overflow);
        // the combined length must fit in size_t
        if (overflow) assert(0);

        arr = realloc(arr, nelems);

        import std.algorithm.mutation : copy;
        // fill the newly added tail
        copy(value, arr[$-value.length..$]);
    }

    pure @safe unittest
    {
        int[] arr;
        ReallocPolicy.append(arr, [1,2,3]);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [1,2,3]));
    }

    // Frees the storage behind `arr` (if any) and resets `arr` to null.
    static void destroy(T)(scope ref T[] arr) @trusted
    {
        import core.memory : pureFree;
        if (arr.ptr)
            pureFree(arr.ptr);
        arr = null;
    }
}
1895 
1896 //build hack
1897 alias _RealArray = CowArray!ReallocPolicy;
1898 
// Exercises ReallocPolicy.replaceImpl for growing, shrinking and same-position replacements.
pure @safe unittest
{
    import std.algorithm.comparison : equal;

    with(ReallocPolicy)
    {
        // Applies the replacement to `orig`, compares with `result` and always
        // releases the policy-owned storage of `orig` afterwards.
        bool test(T, U, V)(T orig, size_t from, size_t to, U toReplace, V result,
                   string file = __FILE__, size_t line = __LINE__)
        {
            {
                replaceImpl(orig, from, to, toReplace);
                scope(exit) destroy(orig);
                if (!equal(orig, result))
                    return false;
            }
            return true;
        }
        // Copies the arguments into storage owned by the policy.
        static T[] arr(T)(T[] args... )
        {
            return dup(args);
        }

        // pure insertion at the front
        assert(test(arr([1, 2, 3, 4]), 0, 0, [5, 6, 7], [5, 6, 7, 1, 2, 3, 4]));
        // pure removal
        assert(test(arr([1, 2, 3, 4]), 0, 2, cast(int[])[], [3, 4]));
        // replacement shorter than the replaced span
        assert(test(arr([1, 2, 3, 4]), 0, 4, [5, 6, 7], [5, 6, 7]));
        // replacement longer than the replaced span
        assert(test(arr([1, 2, 3, 4]), 0, 2, [5, 6, 7], [5, 6, 7, 3, 4]));
        assert(test(arr([1, 2, 3, 4]), 2, 3, [5, 6, 7], [1, 2, 5, 6, 7, 4]));
    }
}
1928 
1929 /**
1930     Tests if T is some kind a set of code points. Intended for template constraints.
1931 */
1932 public template isCodepointSet(T)
1933 {
1934     static if (is(T dummy == InversionList!(Args), Args...))
1935         enum isCodepointSet = true;
1936     else
1937         enum isCodepointSet = false;
1938 }
1939 
1940 /**
1941     Tests if `T` is a pair of integers that implicitly convert to `V`.
1942     The following code must compile for any pair `T`:
1943     ---
1944     (T x){ V a = x[0]; V b = x[1];}
1945     ---
1946     The following must not compile:
1947      ---
1948     (T x){ V c = x[2];}
1949     ---
1950 */
1951 public template isIntegralPair(T, V=uint)
1952 {
1953     enum isIntegralPair = is(typeof((T x){ V a = x[0]; V b = x[1];}))
1954         && !is(typeof((T x){ V c = x[2]; }));
1955 }
1956 
1957 
1958 /**
1959     The recommended default type for set of $(CODEPOINTS).
1960     For details, see the current implementation: $(LREF InversionList).
1961 */
1962 public alias CodepointSet = InversionList!GcPolicy;
1963 
1964 
1965 //@@@BUG: std.typecons tuples depend on std.format to produce fields mixin
1966 // which relies on std.uni.isGraphical and this chain blows up with Forward reference error
1967 // hence below doesn't seem to work
1968 // public alias CodepointInterval = Tuple!(uint, "a", uint, "b");
1969 
1970 /**
1971     The recommended type of $(REF Tuple, std,_typecons)
1972     to represent [a, b$(RPAREN) intervals of $(CODEPOINTS). As used in $(LREF InversionList).
1973     Any interval type should pass $(LREF isIntegralPair) trait.
1974 */
1975 public struct CodepointInterval
1976 {
1977 pure:
1978     uint[2] _tuple;
1979     alias _tuple this;
1980 
1981 @safe pure nothrow @nogc:
1982 
1983     this(uint low, uint high)
1984     {
1985         _tuple[0] = low;
1986         _tuple[1] = high;
1987     }
1988     bool opEquals(T)(T val) const
1989     {
1990         return this[0] == val[0] && this[1] == val[1];
1991     }
1992     @property ref inout(uint) a() return inout { return _tuple[0]; }
1993     @property ref inout(uint) b() return inout { return _tuple[1]; }
1994 }
1995 
1996 /**
1997     $(P
1998     `InversionList` is a set of $(CODEPOINTS)
1999     represented as an array of open-right [a, b$(RPAREN)
2000     intervals (see $(LREF CodepointInterval) above).
2001     The name comes from the way the representation reads left to right.
2002     For instance a set of all values [10, 50$(RPAREN), [80, 90$(RPAREN),
2003     plus a singular value 60 looks like this:
2004     )
2005     ---
2006     10, 50, 60, 61, 80, 90
2007     ---
2008     $(P
2009     The way to read this is: start with negative meaning that all numbers
    smaller than the next one are not present in this set (and positive -
2011     the contrary). Then switch positive/negative after each
2012     number passed from left to right.
2013     )
2014     $(P This way negative spans until 10, then positive until 50,
2015     then negative until 60, then positive until 61, and so on.
2016     As seen this provides a space-efficient storage of highly redundant data
2017     that comes in long runs. A description which Unicode $(CHARACTER)
2018     properties fit nicely. The technique itself could be seen as a variation
2019     on $(LINK2 https://en.wikipedia.org/wiki/Run-length_encoding, RLE encoding).
2020     )
2021 
2022     $(P Sets are value types (just like `int` is) thus they
2023         are never aliased.
2024     )
2025         Example:
2026         ---
2027         auto a = CodepointSet('a', 'z'+1);
2028         auto b = CodepointSet('A', 'Z'+1);
2029         auto c = a;
2030         a = a | b;
2031         assert(a == CodepointSet('A', 'Z'+1, 'a', 'z'+1));
2032         assert(a != c);
2033         ---
2034     $(P See also $(LREF unicode) for simpler construction of sets
2035         from predefined ones.
2036     )
2037 
2038     $(P Memory usage is 8 bytes per each contiguous interval in a set.
2039     The value semantics are achieved by using the
2040     $(HTTP en.wikipedia.org/wiki/Copy-on-write, COW) technique
2041     and thus it's $(RED not) safe to cast this type to $(D_KEYWORD shared).
2042     )
2043 
2044     Note:
2045     $(P It's not recommended to rely on the template parameters
2046     or the exact type of a current $(CODEPOINT) set in `std.uni`.
2047     The type and parameters may change when the standard
2048     allocators design is finalized.
2049     Use $(LREF isCodepointSet) with templates or just stick with the default
2050     alias $(LREF CodepointSet) throughout the whole code base.
2051     )
2052 */
2053 public struct InversionList(SP=GcPolicy)
2054 {
2055     import std.range : assumeSorted;
2056 
2057     /**
2058         Construct from another code point set of any type.
2059     */
2060     this(Set)(Set set) pure
2061         if (isCodepointSet!Set)
2062     {
2063         uint[] arr;
2064         foreach (v; set.byInterval)
2065         {
2066             arr ~= v.a;
2067             arr ~= v.b;
2068         }
2069         data = CowArray!(SP).reuse(arr);
2070     }
2071 
2072     /**
2073         Construct a set from a forward range of code point intervals.
2074     */
2075     this(Range)(Range intervals) pure
2076         if (isForwardRange!Range && isIntegralPair!(ElementType!Range))
2077     {
2078         uint[] arr;
2079         foreach (v; intervals)
2080         {
2081             SP.append(arr, v.a);
2082             SP.append(arr, v.b);
2083         }
2084         data = CowArray!(SP).reuse(arr);
2085         sanitize(); //enforce invariant: sort intervals etc.
2086     }
2087 
2088     //helper function that avoids sanity check to be CTFE-friendly
2089     private static fromIntervals(Range)(Range intervals) pure
2090     {
2091         import std.algorithm.iteration : map;
2092         import std.range : roundRobin;
2093         auto flattened = roundRobin(intervals.save.map!"a[0]"(),
2094             intervals.save.map!"a[1]"());
2095         InversionList set;
2096         set.data = CowArray!(SP)(flattened);
2097         return set;
2098     }
2099     //ditto untill sort is CTFE-able
2100     private static fromIntervals()(uint[] intervals...) pure
2101     in
2102     {
2103         import std.conv : text;
2104         assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
2105         for (uint i = 0; i < intervals.length; i += 2)
2106         {
2107             auto a = intervals[i], b = intervals[i+1];
2108             assert(a < b, text("illegal interval [a, b): ", a, " > ", b));
2109         }
2110     }
2111     do
2112     {
2113         InversionList set;
2114         set.data = CowArray!(SP)(intervals);
2115         return set;
2116     }
2117 
2118     /**
2119         Construct a set from plain values of code point intervals.
2120     */
2121     this()(uint[] intervals...)
2122     in
2123     {
2124         import std.conv : text;
2125         assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
2126         for (uint i = 0; i < intervals.length; i += 2)
2127         {
2128             auto a = intervals[i], b = intervals[i+1];
2129             assert(a < b, text("illegal interval [a, b): ", a, " > ", b));
2130         }
2131     }
2132     do
2133     {
2134         data = CowArray!(SP)(intervals);
2135         sanitize(); //enforce invariant: sort intervals etc.
2136     }
2137 
2138     ///
2139     pure @safe unittest
2140     {
2141         import std.algorithm.comparison : equal;
2142 
2143         auto set = CodepointSet('a', 'z'+1, 'а', 'я'+1);
2144         foreach (v; 'a'..'z'+1)
2145             assert(set[v]);
2146         // Cyrillic lowercase interval
2147         foreach (v; 'а'..'я'+1)
2148             assert(set[v]);
2149         //specific order is not required, intervals may interesect
2150         auto set2 = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1);
2151         //the same end result
2152         assert(set2.byInterval.equal(set.byInterval));
2153         // test constructor this(Range)(Range intervals)
2154         auto chessPiecesWhite = CodepointInterval(9812, 9818);
2155         auto chessPiecesBlack = CodepointInterval(9818, 9824);
2156         auto set3 = CodepointSet([chessPiecesWhite, chessPiecesBlack]);
2157         foreach (v; '♔'..'♟'+1)
2158             assert(set3[v]);
2159     }
2160 
2161     /**
2162         Get range that spans all of the $(CODEPOINT) intervals in this $(LREF InversionList).
2163     */
2164     @property auto byInterval() scope
2165     {
2166         // TODO: change this to data[] once the -dip1000 errors have been fixed
2167         // see e.g. https://github.com/dlang/phobos/pull/6638
2168         import std.array : array;
2169         return Intervals!(typeof(data.array))(data.array);
2170     }
2171 
    @safe unittest
    {
        import std.algorithm.comparison : equal;
        import std.typecons : tuple;

        auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1);

        // intervals are open on the right, hence 'E' and 'e' as upper bounds
        assert(set.byInterval.equal([tuple('A','E'), tuple('a','e')]));
    }
2181 
    // Returns all intervals of this set collected into a newly built array
    // (via std.array.array over the interval range of `data[]`).
    package(std) @property const(CodepointInterval)[] intervals() const
    {
        import std.array : array;
        return Intervals!(typeof(data[]))(data[]).array;
    }
2187 
2188     /**
2189         Tests the presence of code point `val` in this set.
2190     */
2191     bool opIndex(uint val) const
2192     {
2193         // the <= ensures that searching in  interval of [a, b) for 'a' you get .length == 1
2194         // return assumeSorted!((a,b) => a <= b)(data[]).lowerBound(val).length & 1;
2195         return sharSwitchLowerBound!"a <= b"(data[], val) & 1;
2196     }
2197 
2198     ///
2199     pure @safe unittest
2200     {
2201         auto gothic = unicode.Gothic;
2202         // Gothic letter ahsa
2203         assert(gothic['\U00010330']);
2204         // no ascii in Gothic obviously
2205         assert(!gothic['$']);
2206     }
2207 
2208 
2209     // Linear scan for `ch`. Useful only for small sets.
2210     // TODO:
2211     // used internally in std.regex
2212     // should be properly exposed in a public API ?
2213     package(std) auto scanFor()(dchar ch) const
2214     {
2215         immutable len = data.length;
2216         for (size_t i = 0; i < len; i++)
2217             if (ch < data[i])
2218                 return i & 1;
2219         return 0;
2220     }
2221 
2222     /// Number of $(CODEPOINTS) in this set
2223     @property size_t length()
2224     {
2225         size_t sum = 0;
2226         foreach (iv; byInterval)
2227         {
2228             sum += iv.b - iv.a;
2229         }
2230         return sum;
2231     }
2232 
2233 // bootstrap full set operations from 4 primitives (suitable as a template mixin):
2234 // addInterval, skipUpTo, dropUpTo & byInterval iteration
2235 //============================================================================
2236 public:
2237     /**
2238         $(P Sets support natural syntax for set algebra, namely: )
2239         $(BOOKTABLE ,
2240             $(TR $(TH Operator) $(TH Math notation) $(TH Description) )
2241             $(TR $(TD &) $(TD a ∩ b) $(TD intersection) )
2242             $(TR $(TD |) $(TD a ∪ b) $(TD union) )
2243             $(TR $(TD -) $(TD a ∖ b) $(TD subtraction) )
2244             $(TR $(TD ~) $(TD a ~ b) $(TD symmetric set difference i.e. (a ∪ b) \ (a ∩ b)) )
2245         )
2246     */
    This opBinary(string op, U)(U rhs)
        if (isCodepointSet!U || is(U:dchar))
    {
        static if (op == "&" || op == "|" || op == "~")
        {// symmetric ops thus can swap arguments to reuse r-value
            static if (is(U:dchar))
            {
                // single code point: work on a copy of this set
                auto tmp = this;
                mixin("tmp "~op~"= rhs; ");
                return tmp;
            }
            else
            {
                static if (is(Unqual!U == U))
                {
                    // try hard to reuse r-value
                    // (`rhs` is our own by-value parameter, so it can be
                    // modified and returned directly)
                    mixin("rhs "~op~"= this;");
                    return rhs;
                }
                else
                {
                    // qualified rhs is not modified: fall back to a copy of this set
                    auto tmp = this;
                    mixin("tmp "~op~"= rhs;");
                    return tmp;
                }
            }
        }
        else static if (op == "-") // anti-symmetric
        {
            // operand order matters, so always start from a copy of this set
            auto tmp = this;
            tmp -= rhs;
            return tmp;
        }
        else
            static assert(0, "no operator "~op~" defined for Set");
    }
2283 
2284     ///
2285     pure @safe unittest
2286     {
2287         import std.algorithm.comparison : equal;
2288         import std.range : iota;
2289 
2290         auto lower = unicode.LowerCase;
2291         auto upper = unicode.UpperCase;
2292         auto ascii = unicode.ASCII;
2293 
2294         assert((lower & upper).empty); // no intersection
2295         auto lowerASCII = lower & ascii;
2296         assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1)));
2297         // throw away all of the lowercase ASCII
2298         assert((ascii - lower).length == 128 - 26);
2299 
2300         auto onlyOneOf = lower ~ ascii;
2301         assert(!onlyOneOf['Δ']); // not ASCII and not lowercase
2302         assert(onlyOneOf['$']); // ASCII and not lowercase
2303         assert(!onlyOneOf['a']); // ASCII and lowercase
2304         assert(onlyOneOf['я']); // not ASCII but lowercase
2305 
2306         // throw away all cased letters from ASCII
2307         auto noLetters = ascii - (lower | upper);
2308         assert(noLetters.length == 128 - 26*2);
2309     }
2310 
    /// The 'op=' versions of the above overloaded operators.
    ref This opOpAssign(string op, U)(U rhs)
        if (isCodepointSet!U || is(U:dchar))
    {
        static if (op == "|")    // union
        {
            static if (is(U:dchar))
            {
                // a single code point is the one-element interval [rhs, rhs+1)
                this.addInterval(rhs, rhs+1);
                return this;
            }
            else
                return this.add(rhs);
        }
        else static if (op == "&")   // intersection
                return this.intersect(rhs);// overloaded
        else static if (op == "-")   // set difference
                return this.sub(rhs);// overloaded
        else static if (op == "~")   // symmetric set difference
        {
            // (this | rhs) - (this & rhs); the intersection has to be taken
            // before `this` is modified
            auto copy = this & rhs;
            this |= rhs;
            this -= copy;
            return this;
        }
        else
            static assert(0, "no operator "~op~" defined for Set");
    }
2339 
2340     /**
2341         Tests the presence of codepoint `ch` in this set,
2342         the same as $(LREF opIndex).
2343     */
2344     bool opBinaryRight(string op: "in", U)(U ch) const
2345         if (is(U : dchar))
2346     {
2347         return this[ch];
2348     }
2349 
2350     ///
2351     pure @safe unittest
2352     {
2353         assert('я' in unicode.Cyrillic);
2354         assert(!('z' in unicode.Cyrillic));
2355     }
2356 
2357 
2358 
2359     /**
2360      * Obtains a set that is the inversion of this set.
2361      *
2362      * See_Also: $(LREF inverted)
2363      */
2364     auto opUnary(string op: "!")()
2365     {
2366         return this.inverted;
2367     }
2368 
2369     /**
2370         A range that spans each $(CODEPOINT) in this set.
2371     */
2372     @property auto byCodepoint()
2373     {
2374         static struct CodepointRange
2375         {
2376             this(This set)
2377             {
2378                 r = set.byInterval;
2379                 if (!r.empty)
2380                     cur = r.front.a;
2381             }
2382 
2383             @property dchar front() const
2384             {
2385                 return cast(dchar) cur;
2386             }
2387 
2388             @property bool empty() const
2389             {
2390                 return r.empty;
2391             }
2392 
2393             void popFront()
2394             {
2395                 cur++;
2396                 while (cur >= r.front.b)
2397                 {
2398                     r.popFront();
2399                     if (r.empty)
2400                         break;
2401                     cur = r.front.a;
2402                 }
2403             }
2404         private:
2405             uint cur;
2406             typeof(This.init.byInterval) r;
2407         }
2408 
2409         return CodepointRange(this);
2410     }
2411 
2412     ///
2413     pure @safe unittest
2414     {
2415         import std.algorithm.comparison : equal;
2416         import std.range : iota;
2417 
2418         auto set = unicode.ASCII;
2419         set.byCodepoint.equal(iota(0, 0x80));
2420     }
2421 
2422     /**
        $(P Obtain textual representation of this set in the form of
2424         open-right intervals and feed it to `sink`.
2425         )
2426         $(P Used by various standard formatting facilities such as
2427          $(REF formattedWrite, std,format), $(REF write, std,stdio),
2428          $(REF writef, std,stdio), $(REF to, std,conv) and others.
2429         )
2430         Example:
2431         ---
2432         import std.conv;
2433         assert(unicode.ASCII.to!string == "[0..128$(RPAREN)");
2434         ---
2435     */
2436 
2437     private import std.format.spec : FormatSpec;
2438 
2439     /***************************************
2440      * Obtain a textual representation of this InversionList
2441      * in form of open-right intervals.
2442      *
2443      * The formatting flag is applied individually to each value, for example:
2444      * $(LI $(B %s) and $(B %d) format the intervals as a [low .. high$(RPAREN) range of integrals)
2445      * $(LI $(B %x) formats the intervals as a [low .. high$(RPAREN) range of lowercase hex characters)
2446      * $(LI $(B %X) formats the intervals as a [low .. high$(RPAREN) range of uppercase hex characters)
2447      */
2448     void toString(Writer)(scope Writer sink, scope const ref FormatSpec!char fmt) /* const */
2449     {
2450         import std.format.write : formatValue;
2451         auto range = byInterval;
2452         if (range.empty)
2453             return;
2454 
2455         while (1)
2456         {
2457             auto i = range.front;
2458             range.popFront();
2459 
2460             put(sink, "[");
2461             formatValue(sink, i.a, fmt);
2462             put(sink, "..");
2463             formatValue(sink, i.b, fmt);
2464             put(sink, ")");
2465             if (range.empty) return;
2466             put(sink, " ");
2467         }
2468     }
2469 
2470     ///
    pure @safe unittest
    {
        import std.conv : to;
        import std.format : format;
        import std.uni : unicode;

        // This was originally using the Cyrillic script.
        // Unfortunately that is a pretty active range for changes,
        // and hence it broke in an update.
        // Therefore the Basic Latin range is used instead, as it is
        // unlikely to ever change.

        assert(unicode.InBasic_latin.to!string == "[0..128)");

        // The specs '%s' and '%d' are equivalent to the to!string call above.
        assert(format("%d", unicode.InBasic_latin) == unicode.InBasic_latin.to!string);

        // The '#' flag adds the hexadecimal prefix to each formatted bound.
        assert(format("%#x", unicode.InBasic_latin) == "[0..0x80)");
        assert(format("%#X", unicode.InBasic_latin) == "[0..0X80)");
    }
2491 
    pure @safe unittest
    {
        import std.exception : assertThrown;
        import std.format : format, FormatException;
        // an unknown format specifier is rejected when formatting a set
        assertThrown!FormatException(format("%z", unicode.ASCII));
    }
2498 
2499 
2500     /**
2501         Add an interval [a, b$(RPAREN) to this set.
2502     */
    ref add()(uint a, uint b)
    {
        // delegate to the private interval-insertion routine; its returned
        // marker is not needed here
        addInterval(a, b);
        // return the set itself so that calls can be chained
        return this;
    }
2508 
2509     ///
    pure @safe unittest
    {
        CodepointSet someSet;
        // add() returns the set by reference, so calls can be chained
        someSet.add('0', '5').add('A','Z'+1);
        // intervals are open-right: '9'+1 is needed to include '9'
        someSet.add('5', '9'+1);
        assert(someSet['0']);
        assert(someSet['5']);
        assert(someSet['9']);
        assert(someSet['Z']);
    }
2520 
2521 private:
2522 
2523   package(std)  // used from: std.regex.internal.parser
2524     ref intersect(U)(U rhs)
2525         if (isCodepointSet!U)
2526     {
2527         Marker mark;
2528         foreach ( i; rhs.byInterval)
2529         {
2530             mark = this.dropUpTo(i.a, mark);
2531             mark = this.skipUpTo(i.b, mark);
2532         }
2533         this.dropUpTo(uint.max, mark);
2534         return this;
2535     }
2536 
2537     ref intersect()(dchar ch)
2538     {
2539         foreach (i; byInterval)
2540             if (i.a <= ch && ch < i.b)
2541                 return this = This.init.add(ch, ch+1);
2542         this = This.init;
2543         return this;
2544     }
2545 
    pure @safe unittest
    {
        // '-' is not a Cyrillic code point, so the intersection is empty
        assert(unicode.Cyrillic.intersect('-').byInterval.empty);
    }
2550 
    ref sub()(dchar ch)
    {
        // single code point subtraction is forwarded to subChar
        return subChar(ch);
    }
2555 
2556     // same as the above except that skip & drop parts are swapped
2557   package(std)  // used from: std.regex.internal.parser
2558     ref sub(U)(U rhs)
2559         if (isCodepointSet!U)
2560     {
2561         Marker mark;
2562         foreach (i; rhs.byInterval)
2563         {
2564             mark = this.skipUpTo(i.a, mark);
2565             mark = this.dropUpTo(i.b, mark);
2566         }
2567         return this;
2568     }
2569 
2570   package(std)  // used from: std.regex.internal.parse
2571     ref add(U)(U rhs)
2572         if (isCodepointSet!U)
2573     {
2574         Marker start;
2575         foreach (i; rhs.byInterval)
2576         {
2577             start = addInterval(i.a, i.b, start);
2578         }
2579         return this;
2580     }
2581 
2582 // end of mixin-able part
2583 //============================================================================
2584 public:
2585     /**
2586         Obtains a set that is the inversion of this set.
2587 
2588         See the '!' $(LREF opUnary) for the same but using operators.
2589     */
2590     @property auto inverted()
2591     {
2592         InversionList inversion = this;
2593         if (inversion.data.length == 0)
2594         {
2595             inversion.addInterval(0, lastDchar+1);
2596             return inversion;
2597         }
2598         if (inversion.data[0] != 0)
2599             genericReplace(inversion.data, 0, 0, [0]);
2600         else
2601             genericReplace(inversion.data, 0, 1, cast(uint[]) null);
2602         if (data[data.length-1] != lastDchar+1)
2603             genericReplace(inversion.data,
2604                 inversion.data.length, inversion.data.length, [lastDchar+1]);
2605         else
2606             genericReplace(inversion.data,
2607                 inversion.data.length-1, inversion.data.length, cast(uint[]) null);
2608 
2609         return inversion;
2610     }
2611 
2612     ///
    pure @safe unittest
    {
        auto set = unicode.ASCII;
        // union with the inverse gets all of the code points in Unicode
        // (0x110000 of them)
        assert((set | set.inverted).length == 0x110000);
        // no intersection with the inverse
        assert((set & set.inverted).empty);
    }
2621 
    // Builds the text of a D function `bool funcName(dchar ch)` that returns
    // true when `ch` falls into one of the [a, b) intervals of `range`.
    // An empty `funcName` is replaced by the word "function".
    // The generated code is a tree of comparisons: bisection on intervals
    // while at least `maxBinary` of them remain, a linear chain below that.
    package(std) static string toSourceCode(const(CodepointInterval)[] range, string funcName)
    {
        import std.algorithm.searching : countUntil;
        import std.format : format;
        // fewer intervals than this are tested linearly instead of bisected
        enum maxBinary = 3;
        // Emits a braced block that tests `ch` against each interval in turn
        // and ends with `return false;`.
        static string linearScope(R)(R ivals, string indent)
        {
            string result = indent~"{\n";
            string deeper = indent~"    ";
            foreach (ival; ivals)
            {
                immutable span = ival[1] - ival[0];
                assert(span != 0);
                if (span == 1)
                {
                    // single code point: one equality test
                    result ~= format("%sif (ch == %s) return true;\n", deeper, ival[0]);
                }
                else if (span == 2)
                {
                    // two code points: two equality tests
                    result ~= format("%sif (ch == %s || ch == %s) return true;\n",
                        deeper, ival[0], ival[0]+1);
                }
                else
                {
                    // wider interval: a pair of ordered comparisons
                    if (ival[0] != 0) // dchar is unsigned and  < 0 is useless
                        result ~= format("%sif (ch < %s) return false;\n", deeper, ival[0]);
                    result ~= format("%sif (ch < %s) return true;\n", deeper, ival[1]);
                }
            }
            result ~= format("%sreturn false;\n%s}\n", deeper, indent); // including empty range of intervals
            return result;
        }

        // Chooses between the linear chain and a further bisection step,
        // depending on how many intervals are left.
        static string binaryScope(R)(R ivals, string indent) @safe
        {
            // time to do unrolled comparisons?
            if (ivals.length < maxBinary)
                return linearScope(ivals, indent);
            else
                return bisect(ivals, ivals.length/2, indent);
        }

        // Not used yet: if/else binary search is far better with DMD as of
        // 2.061, and GDC is doing a fine job either way.
        // Emits a `switch (ch)` with one case (or case range) per interval.
        static string switchScope(R)(R ivals, string indent)
        {
            string result = indent~"switch (ch){\n";
            string deeper = indent~"    ";
            foreach (ival; ivals)
            {
                if (ival[0]+1 == ival[1])
                {
                    result ~= format("%scase %s: return true;\n",
                        deeper, ival[0]);
                }
                else
                {
                    // case ranges are inclusive, hence the -1 on the upper bound
                    result ~= format("%scase %s: .. case %s: return true;\n",
                         deeper, ival[0], ival[1]-1);
                }
            }
            result ~= deeper~"default: return false;\n"~indent~"}\n";
            return result;
        }

        // Emits a three-way branch around the interval at `idx`: intervals
        // before it, the interval itself, and intervals after it.
        static string bisect(R)(R range, size_t idx, string indent)
        {
            string deeper = indent ~ "    ";
            // bisect on one [a, b) interval at idx
            string result = indent~"{\n";
            // less branch, < a
            result ~= format("%sif (ch < %s)\n%s",
                deeper, range[idx][0], binaryScope(range[0 .. idx], deeper));
            // middle point,  >= a && < b
            result ~= format("%selse if (ch < %s) return true;\n",
                deeper, range[idx][1]);
            // greater or equal branch,  >= b
            result ~= format("%selse\n%s",
                deeper, binaryScope(range[idx+1..$], deeper));
            return result~indent~"}\n";
        }

        string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n",
            funcName.empty ? "function" : funcName);
        // special case first bisection to be on ASCII vs beyond
        // (index of the first interval that starts above 0x80)
        auto tillAscii = countUntil!"a[0] > 0x80"(range);
        if (tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0)
            code ~= binaryScope(range, "");
        else
            code ~= bisect(range, tillAscii, "");
        return code;
    }
2714 
2715     /**
2716         Generates string with D source code of unary function with name of
2717         `funcName` taking a single `dchar` argument. If `funcName` is empty
2718         the code is adjusted to be a lambda function.
2719 
2720         The function generated tests if the $(CODEPOINT) passed
2721         belongs to this set or not. The result is to be used with string mixin.
2722         The intended usage area is aggressive optimization via meta programming
2723         in parser generators and the like.
2724 
2725         Note: Use with care for relatively small or regular sets. It
        could end up being slower than just using multi-staged tables.
2727 
2728         Example:
2729         ---
2730         import std.stdio;
2731 
        // construct set directly from [a, b$(RPAREN) intervals
2733         auto set = CodepointSet(10, 12, 45, 65, 100, 200);
2734         writeln(set);
2735         writeln(set.toSourceCode("func"));
2736         ---
2737 
2738         The above outputs something along the lines of:
2739         ---
2740         bool func(dchar ch)  @safe pure nothrow @nogc
2741         {
2742             if (ch < 45)
2743             {
2744                 if (ch == 10 || ch == 11) return true;
2745                 return false;
2746             }
2747             else if (ch < 65) return true;
2748             else
2749             {
2750                 if (ch < 100) return false;
2751                 if (ch < 200) return true;
2752                 return false;
2753             }
2754         }
2755         ---
2756     */
2757     string toSourceCode(string funcName="")
2758     {
2759         import std.array : array;
2760         auto range = byInterval.array();
2761         return toSourceCode(range, funcName);
2762     }
2763 
2764     /**
2765         True if this set doesn't contain any $(CODEPOINTS).
2766     */
2767     @property bool empty() const
2768     {
2769         return data.length == 0;
2770     }
2771 
2772     ///
    pure @safe unittest
    {
        // a default-initialized set contains no code points
        CodepointSet emptySet;
        assert(emptySet.length == 0);
        assert(emptySet.empty);
    }
2779 
2780 private:
2781     alias This = typeof(this);
2782     alias Marker = size_t;
2783 
2784     // a random-access range of integral pairs
2785     static struct Intervals(Range)
2786     {
2787         import std.range.primitives : hasAssignableElements;
2788 
2789         this(Range sp) scope
2790         {
2791             slice = sp;
2792             start = 0;
2793             end = sp.length;
2794         }
2795 
2796         this(Range sp, size_t s, size_t e) scope
2797         {
2798             slice = sp;
2799             start = s;
2800             end = e;
2801         }
2802 
2803         @property auto front()const
2804         {
2805             immutable a = slice[start];
2806             immutable b = slice[start+1];
2807             return CodepointInterval(a, b);
2808         }
2809 
2810         //may break sorted property - but we need std.sort to access it
2811         //hence package(std) protection attribute
2812         static if (hasAssignableElements!Range)
2813         package(std) @property void front(CodepointInterval val)
2814         {
2815             slice[start] = val.a;
2816             slice[start+1] = val.b;
2817         }
2818 
2819         @property auto back()const
2820         {
2821             immutable a = slice[end-2];
2822             immutable b = slice[end-1];
2823             return CodepointInterval(a, b);
2824         }
2825 
2826         //ditto about package
2827         static if (hasAssignableElements!Range)
2828         package(std) @property void back(CodepointInterval val)
2829         {
2830             slice[end-2] = val.a;
2831             slice[end-1] = val.b;
2832         }
2833 
2834         void popFront()
2835         {
2836             start += 2;
2837         }
2838 
2839         void popBack()
2840         {
2841             end -= 2;
2842         }
2843 
2844         auto opIndex(size_t idx) const
2845         {
2846             immutable a = slice[start+idx*2];
2847             immutable b = slice[start+idx*2+1];
2848             return CodepointInterval(a, b);
2849         }
2850 
2851         //ditto about package
2852         static if (hasAssignableElements!Range)
2853         package(std) void opIndexAssign(CodepointInterval val, size_t idx)
2854         {
2855             slice[start+idx*2] = val.a;
2856             slice[start+idx*2+1] = val.b;
2857         }
2858 
2859         auto opSlice(size_t s, size_t e)
2860         {
2861             return Intervals(slice, s*2+start, e*2+start);
2862         }
2863 
2864         @property size_t length()const {  return slice.length/2; }
2865 
2866         @property bool empty()const { return start == end; }
2867 
2868         @property auto save(){ return this; }
2869     private:
2870         size_t start, end;
2871         Range slice;
2872     }
2873 
2874     // called after construction from intervals
2875     // to make sure invariants hold
    // Sorts the stored intervals by their lower bound, merges the ones that
    // overlap or touch, and shrinks `data` to the merged result.
    void sanitize()
    {
        import std.algorithm.comparison : max;
        import std.algorithm.mutation : SwapStrategy;
        import std.algorithm.sorting : sort;
        if (data.length == 0)
            return;
        alias Ival = CodepointInterval;
        //intervals wrapper for a _range_ over packed array
        auto ivals = Intervals!(typeof(data[]))(data[]);
        //@@@BUG@@@ can't use "a.a < b.a" see
        // https://issues.dlang.org/show_bug.cgi?id=12265
        sort!((a,b) => a.a < b.a, SwapStrategy.stable)(ivals);
        // what follows is a variation on stable remove
        // differences:
        // - predicate is binary, and is tested against
        //   the last kept element (at 'i').
        // - predicate mutates lhs (merges rhs into lhs)
        size_t len = ivals.length;
        size_t i = 0; // index of the last kept interval
        size_t j = 1; // index of the interval being examined
        while (j < len)
        {
            // overlapping or adjacent: extend the kept interval
            if (ivals[i].b >= ivals[j].a)
            {
                ivals[i] = Ival(ivals[i].a, max(ivals[i].b, ivals[j].b));
                j++;
            }
            else //unmergable
            {
                // check if there is a hole after merges
                // (in the best case we do 0 writes to ivals)
                if (j != i+1)
                    ivals[i+1] = ivals[j]; //copy over
                i++;
                j++;
            }
        }
        // number of intervals kept
        len = i + 1;
        // kept intervals must be non-empty and strictly separated
        // (the last kept interval is not covered by this loop)
        for (size_t k=0; k + 1 < len; k++)
        {
            assert(ivals[k].a < ivals[k].b);
            assert(ivals[k].b < ivals[k+1].a);
        }
        // two stored elements per interval
        data.length = len * 2;
    }
2922 
2923     // special case for normal InversionList
2924     ref subChar(dchar ch)
2925     {
2926         auto mark = skipUpTo(ch);
2927         if (mark != data.length
2928             && data[mark] == ch && data[mark-1] == ch)
2929         {
2930             // it has split, meaning that ch happens to be in one of intervals
2931             data[mark] = data[mark]+1;
2932         }
2933         return this;
2934     }
2935 
2936     //
2937     Marker addInterval(int a, int b, Marker hint=Marker.init) scope
2938     in
2939     {
2940         assert(a <= b);
2941     }
2942     do
2943     {
2944         import std.range : assumeSorted, SearchPolicy;
2945         auto range = assumeSorted(data[]);
2946         size_t pos;
2947         size_t a_idx = hint + range[hint..$].lowerBound!(SearchPolicy.gallop)(a).length;
2948         if (a_idx == range.length)
2949         {
2950             //  [---+++----++++----++++++]
2951             //  [                         a  b]
2952             data.append(a, b);
2953             return data.length-1;
2954         }
2955         size_t b_idx = range[a_idx .. range.length].lowerBound!(SearchPolicy.gallop)(b).length+a_idx;
2956         uint[3] buf = void;
2957         uint to_insert;
2958         debug(std_uni)
2959         {
2960             writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
2961         }
2962         if (b_idx == range.length)
2963         {
2964             //  [-------++++++++----++++++-]
2965             //  [      s     a                 b]
2966             if (a_idx & 1)// a in positive
2967             {
2968                 buf[0] = b;
2969                 to_insert = 1;
2970             }
2971             else// a in negative
2972             {
2973                 buf[0] = a;
2974                 buf[1] = b;
2975                 to_insert = 2;
2976             }
2977             pos = genericReplace(data, a_idx, b_idx, buf[0 .. to_insert]);
2978             return pos - 1;
2979         }
2980 
2981         uint top = data[b_idx];
2982 
2983         debug(std_uni)
2984         {
2985             writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
2986             writefln("a=%s; b=%s; top=%s;", a, b, top);
2987         }
2988         if (a_idx & 1)
2989         {// a in positive
2990             if (b_idx & 1)// b in positive
2991             {
2992                 //  [-------++++++++----++++++-]
2993                 //  [       s    a        b    ]
2994                 buf[0] = top;
2995                 to_insert = 1;
2996             }
2997             else // b in negative
2998             {
2999                 //  [-------++++++++----++++++-]
3000                 //  [       s    a   b         ]
3001                 if (top == b)
3002                 {
3003                     assert(b_idx+1 < data.length);
3004                     buf[0] = data[b_idx+1];
3005                     pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 1]);
3006                     return pos - 1;
3007                 }
3008                 buf[0] = b;
3009                 buf[1] = top;
3010                 to_insert = 2;
3011             }
3012         }
3013         else
3014         { // a in negative
3015             if (b_idx & 1) // b in positive
3016             {
3017                 //  [----------+++++----++++++-]
3018                 //  [     a     b              ]
3019                 buf[0] = a;
3020                 buf[1] = top;
3021                 to_insert = 2;
3022             }
3023             else// b in negative
3024             {
3025                 //  [----------+++++----++++++-]
3026                 //  [  a       s      b        ]
3027                 if (top == b)
3028                 {
3029                     assert(b_idx+1 < data.length);
3030                     buf[0] = a;
3031                     buf[1] = data[b_idx+1];
3032                     pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 2]);
3033                     return pos - 1;
3034                 }
3035                 buf[0] = a;
3036                 buf[1] = b;
3037                 buf[2] = top;
3038                 to_insert = 3;
3039             }
3040         }
3041         pos = genericReplace(data, a_idx, b_idx+1, buf[0 .. to_insert]);
3042         debug(std_uni)
3043         {
3044             writefln("marker idx: %d; length=%d", pos, data[pos], data.length);
3045             writeln("inserting ", buf[0 .. to_insert]);
3046         }
3047         return pos - 1;
3048     }
3049 
3050     //
    // Removes from the set everything between the interval start at `pos`
    // and the value `a`. `pos` must be an even index (start of an interval).
    // Returns the marker from which a following skip/drop should continue.
    Marker dropUpTo(uint a, Marker pos=Marker.init)
    in
    {
        assert(pos % 2 == 0); // at start of interval
    }
    do
    {
        auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
        if (range.empty)
            return pos;
        size_t idx = pos;
        // with the "a <= b" predicate this counts the boundaries that are
        // less than or equal to a
        idx += range.lowerBound(a).length;

        debug(std_uni)
        {
            writeln("dropUpTo full length=", data.length);
            writeln(pos,"~~~", idx);
        }
        // every remaining boundary is <= a: drop the whole tail
        if (idx == data.length)
            return genericReplace(data, pos, idx, cast(uint[])[]);
        if (idx & 1)
        {   // a in positive
            //[--+++----++++++----+++++++------...]
            //      |<---si       s  a  t
            // the interval containing a survives, but now starts at a
            genericReplace(data, pos, idx, [a]);
        }
        else
        {   // a in negative
            //[--+++----++++++----+++++++-------+++...]
            //      |<---si              s  a  t
            // a lies in a gap: just remove the boundaries before it
            genericReplace(data, pos, idx, cast(uint[])[]);
        }
        return pos;
    }
3085 
3086     //
    // Advances over the set up to the value `a` without removing anything,
    // starting the search at index `pos`. If `a` lies strictly inside an
    // interval, that interval is split at `a`. The result is always an even
    // index (start of an interval) or data.length.
    Marker skipUpTo(uint a, Marker pos=Marker.init)
    out(result)
    {
        assert(result % 2 == 0);// always start of interval
        //(may be  0-width after-split)
    }
    do
    {
        assert(data.length % 2 == 0);
        auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
        // with the "a <= b" predicate this counts the boundaries that are
        // less than or equal to a
        size_t idx = pos+range.lowerBound(a).length;

        if (idx >= data.length) // could have Marker point to recently removed stuff
            return data.length;

        if (idx & 1)// inside of interval, check for split
        {

            immutable top = data[idx];
            // NOTE(review): given the "a <= b" search above, data[idx] is
            // presumably always greater than a on sorted data, so this
            // branch looks unreachable - confirm before relying on it
            if (top == a)// no need to split, it's end
                return idx+1;
            immutable start = data[idx-1];
            // a is exactly the start of this interval: nothing to split
            if (a == start)
                return idx-1;
            // split it up
            // [start, top) becomes [start, a) followed by [a, top)
            genericReplace(data, idx, idx+1, [a, a, top]);
            return idx+1;        // avoid odd index
        }
        return idx;
    }
3117 
3118     CowArray!SP data;
3119 }
3120 
pure @safe unittest
{
    import std.conv : to;
    // the textual form of a set is its list of open-right intervals
    assert(unicode.ASCII.to!string() == "[0..128)");
}
3126 
3127 // pedantic version for ctfe, and aligned-access only architectures
@system private uint safeRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    // Reads the idx-th 3-byte group one byte at a time and assembles it
    // in the byte order of the target platform.
    immutable base = idx * 3;
    immutable uint first = ptr[base];
    immutable uint second = ptr[base+1];
    immutable uint third = ptr[base+2];
    version (LittleEndian)
        return first | (second << 8) | (third << 16);
    else
        return (first << 16) | (second << 8) | third;
}
3138 
3139 // ditto
@system private void safeWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    // Stores the low 24 bits of val into the idx-th 3-byte group, one byte
    // at a time, in the byte order of the target platform.
    immutable base = idx * 3;
    immutable ubyte low = cast(ubyte) val;
    immutable ubyte middle = cast(ubyte)(val >> 8);
    immutable ubyte high = cast(ubyte)(val >> 16);
    version (LittleEndian)
    {
        ptr[base] = low;
        ptr[base+1] = middle;
        ptr[base+2] = high;
    }
    else
    {
        ptr[base] = high;
        ptr[base+1] = middle;
        ptr[base+2] = low;
    }
}
3156 
3157 // unaligned x86-like read/write functions
@system private uint unalignedRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    // Loads a whole 32-bit word at the (possibly unaligned) address of the
    // idx-th 3-byte group and strips the one byte that does not belong to it.
    immutable uint word = *cast(const(uint)*)(ptr + 3*idx);
    version (LittleEndian)
        return word & 0xFF_FFFF;
    else
        return word >> 8;
}
3166 
3167 // ditto
@system private void unalignedWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    // Read-modify-write of a whole 32-bit word at the (possibly unaligned)
    // address of the idx-th 3-byte group; the one byte of the word that does
    // not belong to the group is preserved.
    uint* dest = cast(uint*)(cast(ubyte*) ptr + 3*idx);
    version (LittleEndian)
    {
        // Only the low 24 bits of val are stored, the same as safeWrite24:
        // without the mask, higher bits would overwrite the preserved byte.
        *dest = (val & 0xFF_FFFF) | (*dest & 0xFF00_0000);
    }
    else
    {
        // the shift already discards the top 8 bits of val
        *dest = (val << 8) | (*dest & 0xFF);
    }
}
3176 
@system private uint read24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    // The word-sized read is used only at run time on targets that allow
    // unaligned reads; CTFE and all other targets read byte by byte.
    static if (hasUnalignedReads)
    {
        if (__ctfe)
            return safeRead24(ptr, idx);
        return unalignedRead24(ptr, idx);
    }
    else
    {
        return safeRead24(ptr, idx);
    }
}
3184 
@system private void write24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    // The word-sized write is used only at run time on targets that allow
    // unaligned access; CTFE and all other targets write byte by byte.
    static if (hasUnalignedReads)
    {
        if (__ctfe)
            safeWrite24(ptr, val, idx);
        else
            unalignedWrite24(ptr, val, idx);
    }
    else
    {
        safeWrite24(ptr, val, idx);
    }
}
3192 
// Array of uint whose storage is shared between copies and counted in one
// extra slot kept right after the payload. A copy bumps that counter; the
// mutating operations below give `this` its own storage first whenever the
// counter shows other references. `SP` supplies the storage primitives used
// here: alloc, realloc, destroy and append.
struct CowArray(SP=GcPolicy)
{
    import std.range.primitives : hasLength;

  @safe:
    // Wraps `arr` as the payload: a counter slot holding 1 is appended
    // to it through the storage policy.
    static auto reuse(uint[] arr)
    {
        CowArray cow;
        cow.data = arr;
        SP.append(cow.data, 1);
        assert(cow.refCount == 1);
        assert(cow.length == arr.length);
        return cow;
    }

    // Builds the array from a range of known length.
    this(Range)(Range range)
        if (isInputRange!Range && hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        length = range.length;
        // `this.length` is 0 for an empty array, whereas `data[0..$-1]`
        // would underflow when the range (and therefore `data`) is empty
        copy(range, data[0 .. this.length]);
    }

    // Builds the array from a forward range; its length is found by
    // walking a saved copy first.
    this(Range)(Range range)
        if (isForwardRange!Range && !hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        import std.range.primitives : walkLength;
        immutable len = walkLength(range.save);
        length = len;
        // see the note in the constructor above about empty ranges
        copy(range, data[0 .. this.length]);
    }

    // A copy shares the storage, so only the counter is incremented.
    this(this)
    {
        if (!empty)
        {
            refCount = refCount + 1;
        }
    }

    // The last reference destroys the storage, any other one just
    // decrements the counter.
    ~this()
    {
        if (!empty)
        {
            immutable cnt = refCount;
            if (cnt == 1)
                SP.destroy(data);
            else
                refCount = cnt - 1;
        }
    }

    // no ref-count for empty U24 array
    @property bool empty() const { return data.length == 0; }

    // report one less than actual size
    @property size_t length() const
    {
        return data.length ? data.length - 1 : 0;
    }

    //+ an extra slot for ref-count
    @property void length(size_t len)
    {
        import std.algorithm.comparison : min;
        import std.algorithm.mutation : copy;
        if (len == 0)
        {
            if (!empty)
                freeThisReference();
            return;
        }
        immutable total = len + 1; // including ref-count
        if (empty)
        {
            data = SP.alloc!uint(total);
            refCount = 1;
            return;
        }
        immutable cur_cnt = refCount;
        if (cur_cnt != 1) // have more references to this memory
        {
            refCount = cur_cnt - 1;
            auto new_data = SP.alloc!uint(total);
            // take shrinking into account
            auto to_copy = min(total, data.length) - 1;
            copy(data[0 .. to_copy], new_data[0 .. to_copy]);
            data = new_data; // before setting refCount!
            refCount = 1;
        }
        else // 'this' is the only reference
        {
            // use the realloc (hopefully in-place operation)
            data = SP.realloc(data, total);
            refCount = 1; // setup a ref-count in the new end of the array
        }
    }

    alias opDollar = length;

    // Read access never needs a private copy.
    uint opIndex()(size_t idx)const
    {
        return data[idx];
    }

    // Write access: get private storage first if the current one is shared.
    void opIndexAssign(uint val, size_t idx)
    {
        auto cnt = refCount;
        if (cnt != 1)
            dupThisReference(cnt);
        data[idx] = val;
    }

    // A mutable slice may be written through, so the storage is made
    // private before it is handed out.
    auto opSlice(size_t from, size_t to)
    {
        if (!empty)
        {
            auto cnt = refCount;
            if (cnt != 1)
                dupThisReference(cnt);
        }
        return data[from .. to];

    }

    // A const slice is read-only, no private copy is needed.
    auto opSlice(size_t from, size_t to) const
    {
        return data[from .. to];
    }

    // length slices before the ref count
    auto opSlice()
    {
        return opSlice(0, length);
    }

    // ditto
    auto opSlice() const
    {
        return opSlice(0, length);
    }

    // Appends all elements of `range`.
    void append(Range)(Range range)
        if (isInputRange!Range && hasLength!Range && is(ElementType!Range : uint))
    {
        // local import, like every other use of `copy` in this struct
        import std.algorithm.mutation : copy;
        size_t nl = length + range.length;
        length = nl;
        copy(range, this[nl-range.length .. nl]);
    }

    // Appends the given values.
    void append()(uint[] val...)
    {
        // nothing to add; also keeps `$-1` below from underflowing when
        // the array is empty
        if (val.length == 0)
            return;
        length = length + val.length;
        data[$-val.length-1 .. $-1] = val[];
    }

    // Compares payloads only, the counter slot is excluded.
    bool opEquals()(auto ref const CowArray rhs) const
    {
        if (empty ^ rhs.empty)
            return false; // one is empty and the other isn't
        return empty || data[0..$-1] == rhs.data[0..$-1];
    }

private:
    // ref-count is right after the data
    @property uint refCount() const
    {
        return data[$-1];
    }

    @property void refCount(uint cnt)
    {
        data[$-1] = cnt;
    }

    // Detaches `this` from its storage, destroying the storage if this
    // was the last reference.
    void freeThisReference()
    {
        immutable count = refCount;
        if (count != 1) // have more references to this memory
        {
            // dec shared ref-count
            refCount = count - 1;
            data = [];
        }
        else
            SP.destroy(data);
        assert(!data.ptr);
    }

    // Gives `this` a private copy of the shared storage; `count` is the
    // current (shared) value of the counter.
    void dupThisReference(uint count)
    in
    {
        assert(!empty && count != 1 && count == refCount);
    }
    do
    {
        import std.algorithm.mutation : copy;
        // dec shared ref-count
        refCount = count - 1;
        // copy to the new chunk of RAM
        auto new_data = SP.alloc!uint(data.length);
        // bit-blit old stuff except the counter
        copy(data[0..$-1], new_data[0..$-1]);
        data = new_data; // before setting refCount!
        refCount = 1; // so that this updates the right one
    }

    // payload followed by one slot for the ref-count; empty when unused
    uint[] data;
}
3405 
pure @safe unittest// CowArray tests (the type is aliased as U24A below)
{
    import std.algorithm.comparison : equal;
    import std.algorithm.mutation : copy;
    import std.conv : text;
    import std.range : iota, chain;
    import std.range.primitives : isBidirectionalRange, isOutputRange;
    // Mutates the array through a reference while a by-value copy (u24_c)
    // is alive, and checks that the copy keeps its old contents.
    void funcRef(T)(ref T u24)
    {
        u24.length = 2;
        u24[1] = 1024;
        T u24_c = u24;
        assert(u24[1] == 1024);
        u24.length = 0;
        assert(u24.empty);
        u24.append([1, 2]);
        assert(equal(u24[], [1, 2]));
        u24.append(111);
        assert(equal(u24[], [1, 2, 111]));
        // the copy taken before the edits above must be unaffected by them
        assert(!u24_c.empty && u24_c[1] == 1024);
        u24.length = 3;
        copy(iota(0, 3), u24[]);
        assert(equal(u24[], iota(0, 3)));
        assert(u24_c[1] == 1024);
    }

    // Exercises copy construction, assignment and opEquals on a by-value
    // argument; the caller checks afterwards that its own array is intact.
    void func2(T)(T u24)
    {
        T u24_2 = u24;
        T u24_3;
        u24_3 = u24_2;
        assert(u24_2 == u24_3);
        assert(equal(u24[], u24_2[]));
        assert(equal(u24_2[], u24_3[]));
        // funcRef rewrites u24_3 to [0, 1, 2]; u24 and u24_2 must not follow
        funcRef(u24_3);

        assert(equal(u24_3[], iota(0, 3)));
        assert(!equal(u24_2[], u24_3[]));
        assert(equal(u24_2[], u24[]));
        u24_2 = u24_3;
        assert(equal(u24_2[], iota(0, 3)));
        // to test that passed arg is intact outside
        // plus try out opEquals
        u24 = u24_3;
        u24 = T.init;
        u24_3 = T.init;
        assert(u24.empty);
        assert(u24 == u24_3);
        assert(u24 != u24_2);
    }

    // run the same checks for both memory policies
    static foreach (Policy; AliasSeq!(GcPolicy, ReallocPolicy))
    {{
        alias Range = typeof(CowArray!Policy.init[]);
        alias U24A = CowArray!Policy;
        // the slice type must satisfy these range interfaces
        static assert(isForwardRange!Range);
        static assert(isBidirectionalRange!Range);
        static assert(isOutputRange!(Range, uint));
        static assert(isRandomAccessRange!(Range));

        auto arr = U24A([42u, 36, 100]);
        assert(arr[0] == 42);
        assert(arr[1] == 36);
        arr[0] = 72;
        arr[1] = 0xFE_FEFE;
        assert(arr[0] == 72);
        assert(arr[1] == 0xFE_FEFE);
        assert(arr[2] == 100);
        U24A arr2 = arr;
        assert(arr2[0] == 72);
        arr2[0] = 11;
        // test COW-ness
        assert(arr[0] == 72);
        assert(arr2[0] == 11);
        // set this to about 100M to stress-test COW memory management
        foreach (v; 0 .. 10_000)
            func2(arr);
        // func2 takes its argument by value, so arr must be unchanged
        assert(equal(arr[], [72, 0xFE_FEFE, 100]));

        auto r2 = U24A(iota(0, 100));
        assert(equal(r2[], iota(0, 100)), text(r2[]));
        // overwrite the middle [10, 90) through a slice used as an output range
        copy(iota(10, 170, 2), r2[10 .. 90]);
        assert(equal(r2[], chain(iota(0, 10), iota(10, 170, 2), iota(90, 100)))
               , text(r2[]));
    }}
}
3492 
pure @safe unittest// core set primitives test
{
    import std.conv : text;
    alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
    // Intervals are half-open: add(a, b) adds [a, b) and CodeList(a, b, ...)
    // lists interval bounds pairwise, as the assertions below show.
    foreach (CodeList; AllSets)
    {
        CodeList a;
        //"plug a hole" test
        a.add(10, 20).add(25, 30).add(15, 27);
        assert(a == CodeList(10, 30), text(a));

        auto x = CodeList.init;
        x.add(10, 20).add(30, 40).add(50, 60);

        a = x;
        a.add(20, 49);//[10, 49) [50, 60)
        assert(a == CodeList(10, 49, 50 ,60));

        a = x;
        a.add(20, 50);
        assert(a == CodeList(10, 60), text(a));

        // simple unions, mostly edge effects
        x = CodeList.init;
        x.add(10, 20).add(40, 60);

        a = x;
        a.add(10, 25); //[10, 25) [40, 60)
        assert(a == CodeList(10, 25, 40, 60));

        a = x;
        a.add(5, 15); //[5, 20) [40, 60)
        assert(a == CodeList(5, 20, 40, 60));

        a = x;
        a.add(0, 10); // [0, 20) [40, 60)
        assert(a == CodeList(0, 20, 40, 60));

        a = x;
        a.add(0, 5); // prepend
        assert(a == CodeList(0, 5, 10, 20, 40, 60), text(a));

        a = x;
        a.add(5, 20);
        assert(a == CodeList(5, 20, 40, 60));

        a = x;
        a.add(3, 37);
        assert(a == CodeList(3, 37, 40, 60));

        a = x;
        a.add(37, 65);
        assert(a == CodeList(10, 20, 37, 65));

        // some tests on helpers for set intersection
        x = CodeList.init.add(10, 20).add(40, 60).add(100, 120);
        a = x;

        // skip to 60, then drop from there up to 110: [100, 120) becomes [110, 120)
        auto m = a.skipUpTo(60);
        a.dropUpTo(110, m);
        assert(a == CodeList(10, 20, 40, 60, 110, 120), text(a.data[]));

        a = x;
        a.dropUpTo(100);
        assert(a == CodeList(100, 120), text(a.data[]));

        a = x;
        // skip to 50, then drop past the end: only [10, 20) [40, 50) is left
        m = a.skipUpTo(50);
        a.dropUpTo(140, m);
        assert(a == CodeList(10, 20, 40, 50), text(a.data[]));
        a = x;
        a.dropUpTo(60);
        assert(a == CodeList(100, 120), text(a.data[]));
    }
}
3568 
3569 
3570 //test constructor to work with any order of intervals
pure @safe unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text, to;
    import std.range : chain, iota;
    import std.typecons : tuple;
    //ensure constructor handles bad ordering and overlap
    auto c1 = CodepointSet('а', 'я'+1, 'А','Я'+1);
    foreach (ch; chain(iota('а', 'я'+1), iota('А','Я'+1)))
        assert(ch in c1, to!string(ch));

    //contiguous
    assert(CodepointSet(1000, 1006, 1006, 1009)
        .byInterval.equal([tuple(1000, 1009)]));
    //contains
    assert(CodepointSet(900, 1200, 1000, 1100)
        .byInterval.equal([tuple(900, 1200)]));
    //intersect left
    assert(CodepointSet(900, 1100, 1000, 1200)
        .byInterval.equal([tuple(900, 1200)]));
    //intersect right
    assert(CodepointSet(1000, 1200, 900, 1100)
        .byInterval.equal([tuple(900, 1200)]));

    //ditto with extra items at end
    assert(CodepointSet(1000, 1200, 900, 1100, 800, 850)
        .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));
    assert(CodepointSet(900, 1100, 1000, 1200, 800, 850)
        .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));

    //"plug a hole" test
    // the second and third rows fill every gap left by the first one
    auto c2 = CodepointSet(20, 40,
        60, 80, 100, 140, 150, 200,
        40, 60, 80, 100, 140, 150
    );
    assert(c2.byInterval.equal([tuple(20, 200)]));

    // unordered and overlapping input; only the gap [140, 150) stays uncovered
    auto c3 = CodepointSet(
        20, 40, 60, 80, 100, 140, 150, 200,
        0, 10, 15, 100, 10, 20, 200, 220);
    assert(c3.byInterval.equal([tuple(0, 140), tuple(150, 220)]));
}
3613 
3614 
pure @safe unittest
{   // full set operations
    import std.conv : text;
    alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
    foreach (CodeList; AllSets)
    {
        CodeList a, b, c, d;

        // ---- union (|), checked in both operand orders ----
        //"plug a hole"
        a.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b.add(40, 60).add(80, 100).add(140, 150);
        c = a | b;
        d = b | a;
        assert(c == CodeList(20, 200), text(CodeList.stringof," ", c));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(25, 45).add(65, 85).add(95,110).add(150, 210);
        c = a | b; //[20,45) [60, 85) [95, 140) [150, 210)
        d = b | a;
        assert(c == CodeList(20, 45, 60, 85, 95, 140, 150, 210), text(c));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(10, 20).add(30,100).add(145,200);
        c = a | b;//[10, 140) [145, 200)
        d = b | a;
        assert(c == CodeList(10, 140, 145, 200));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(0, 10).add(15, 100).add(10, 20).add(200, 220);
        c = a | b;//[0, 140) [150, 220)
        d = b | a;
        assert(c == CodeList(0, 140, 150, 220));
        assert(c == d, text(c," vs ", d));


        // ---- intersection (&), checked in both operand orders ----
        a = CodeList.init.add(20, 40).add(60, 80);
        b = CodeList.init.add(25, 35).add(65, 75);
        c = a & b;
        d = b & a;
        assert(c == CodeList(25, 35, 65, 75), text(c));
        assert(c == d, text(c," vs ", d));

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(25, 35).add(65, 75).add(110, 130).add(160, 180);
        c = a & b;
        d = b & a;
        assert(c == CodeList(25, 35, 65, 75, 110, 130, 160, 180), text(c));
        assert(c == d, text(c," vs ", d));

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(10, 30).add(60, 120).add(135, 160);
        c = a & b;//[20, 30)[60, 80) [100, 120) [135, 140) [150, 160)
        d = b & a;

        assert(c == CodeList(20, 30, 60, 80, 100, 120, 135, 140, 150, 160),text(c));
        assert(c == d, text(c, " vs ",d));
        // intersecting a result with either of its operands changes nothing
        assert((c & a) == c);
        assert((d & b) == d);
        assert((c & d) == d);

        b = CodeList.init.add(40, 60).add(80, 100).add(140, 200);
        c = a & b;
        d = b & a;
        assert(c == CodeList(150, 200), text(c));
        assert(c == d, text(c, " vs ",d));
        assert((c & a) == c);
        assert((d & b) == d);
        assert((c & d) == d);

        assert((a & a) == a);
        assert((b & b) == b);

        // ---- difference (-) ----
        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(30, 60).add(75, 120).add(190, 300);
        c = a - b;// [20, 30) [60, 75) [120, 140) [150, 190)
        d = b - a;// [40, 60) [80, 100) [200, 300)
        assert(c == CodeList(20, 30, 60, 75, 120, 140, 150, 190), text(c));
        assert(d == CodeList(40, 60, 80, 100, 200, 300), text(d));
        // c and d are disjoint, so subtracting one from the other is a no-op
        assert(c - d == c, text(c-d, " vs ", c));
        assert(d - c == d, text(d-c, " vs ", d));
        assert(c - c == CodeList.init);
        assert(d - d == CodeList.init);

        a = CodeList.init.add(20, 40).add( 60, 80).add(100, 140).add(150,            200);
        b = CodeList.init.add(10,  50).add(60,                           160).add(190, 300);
        c = a - b;// [160, 190)
        d = b - a;// [10, 20) [40, 50) [80, 100) [140, 150) [200, 300)
        assert(c == CodeList(160, 190), text(c));
        assert(d == CodeList(10, 20, 40, 50, 80, 100, 140, 150, 200, 300), text(d));
        assert(c - d == c, text(c-d, " vs ", c));
        assert(d - c == d, text(d-c, " vs ", d));
        assert(c - c == CodeList.init);
        assert(d - d == CodeList.init);

        // ---- symmetric difference (~), checked in both operand orders ----
        a = CodeList.init.add(20,    40).add(60, 80).add(100,      140).add(150,  200);
        b = CodeList.init.add(10, 30).add(45,         100).add(130,             190);
        c = a ~ b; // [10, 20) [30, 40) [45, 60) [80, 130) [140, 150) [190, 200)
        d = b ~ a;
        assert(c == CodeList(10, 20, 30, 40, 45, 60, 80, 130, 140, 150, 190, 200),
               text(c));
        assert(c == d, text(c, " vs ", d));
    }
}
3718 
3719 }
3720 
pure @safe unittest// set operations against a single dchar
{
    import std.conv : text;
    auto base = CodepointSet(10, 100, 120, 200);
    // 'A' is code point 65: removing it splits [10, 100) in two
    auto withoutA = base - 'A';
    assert(withoutA == CodepointSet(10, 65, 66, 100, 120, 200), text(withoutA));
    // 'B' is code point 66: the intersection is the one-element interval [66, 67)
    auto onlyB = base & 'B';
    assert(onlyB == CodepointSet(66, 67));
}
3728 
pure @safe unittest// iteration & opIndex
{
    import std.algorithm.comparison : equal;
    import std.conv : text;
    import std.typecons : tuple, Tuple;

    // only the ReallocPolicy flavour is listed here
    static foreach (CodeList; AliasSeq!(InversionList!(ReallocPolicy)))
    {{
        // the code points expected from ['A', 'N') and ['a', 'n'), in order
        auto arr = "ABCDEFGHIJKLMabcdefghijklm"d;
        auto a = CodeList('A','N','a', 'n');
        assert(equal(a.byInterval,
                [tuple(cast(uint)'A', cast(uint)'N'), tuple(cast(uint)'a', cast(uint)'n')]
            ), text(a.byInterval));

        // same @@@BUG as in https://issues.dlang.org/show_bug.cgi?id=8949 ?
        // (reverse iteration is compiled only when version bug8949 is set)
        version (bug8949)
        {
            import std.range : retro;
            assert(equal(retro(a.byInterval),
                [tuple(cast(uint)'a', cast(uint)'n'), tuple(cast(uint)'A', cast(uint)'N')]
            ), text(retro(a.byInterval)));
        }
        auto achr = a.byCodepoint;
        assert(equal(achr, arr), text(a.byCodepoint));
        // every code point produced by iteration must also test as a member
        foreach (ch; a.byCodepoint)
            assert(a[ch]);
        auto x = CodeList(100, 500, 600, 900, 1200, 1500);
        assert(equal(x.byInterval, [ tuple(100, 500), tuple(600, 900), tuple(1200, 1500)]), text(x.byInterval));
        foreach (ch; x.byCodepoint)
            assert(x[ch]);
        // round-trip through byInterval, compiled only if CodeList is CodepointSet
        // NOTE(review): whether that holds for ReallocPolicy is not visible here
        static if (is(CodeList == CodepointSet))
        {
            auto y = CodeList(x.byInterval);
            assert(equal(x.byInterval, y.byInterval));
        }
        // an empty set yields empty ranges
        assert(equal(CodepointSet.init.byInterval, cast(Tuple!(uint, uint)[])[]));
        assert(equal(CodepointSet.init.byCodepoint, cast(dchar[])[]));
    }}
}
3768 
3769 //============================================================================
3770 // Generic Trie template and various ways to build it
3771 //============================================================================
3772 
3773 // debug helper to get a shortened array dump
// Debug aid: textual form of `x`. Anything longer than 32 elements is
// abbreviated to its first 16 and last 16 elements joined by "~...~".
auto arrayRepr(T)(T x)
{
    import std.conv : text;
    immutable len = x.length;
    if (len <= 32)
        return text(x); // short enough to print in full
    auto head = x[0 .. 16];
    auto tail = x[len-16 .. len];
    return text(head,"~...~", tail);
}
3784 
3785 /**
3786     Maps `Key` to a suitable integer index within the range of `size_t`.
3787     The mapping is constructed by applying predicates from `Prefix` left to right
3788     and concatenating the resulting bits.
3789 
3790     The first (leftmost) predicate defines the most significant bits of
3791     the resulting index.
3792  */
template mapTrieIndex(Prefix...)
{
    // Packs the results of all predicates into one index: before each
    // predicate but the first, the accumulated bits are shifted left by that
    // predicate's `bitSize`, then its result is OR-ed in.
    size_t mapTrieIndex(Key)(Key key)
        if (isValidPrefixForTrie!(Key, Prefix))
    {
        // at least one predicate is required
        static assert(Prefix.length > 0);
        size_t packed;
        foreach (level, _; Prefix)
        {
            static if (level != 0)
                packed <<= Prefix[level].bitSize; // make room for this level
            packed |= Prefix[level](key);
        }
        return packed;
    }
}
3809 
3810 /*
3811     `TrieBuilder` is a type used for incremental construction
3812     of $(LREF Trie)s.
3813 
3814     See $(LREF buildTrie) for generic helpers built on top of it.
3815 */
// Incremental builder of a multi-level lookup table. Values are supplied in
// increasing index order (putValue / putRange); gaps are padded with the
// filler value, and build() pads the remainder and wraps the table in a Trie.
// `Args` is either a list of per-level predicates or an upper bound on Key
// followed by such a list.
@trusted private struct TrieBuilder(Value, Key, Args...)
if (isBitPackableType!Value && isValidArgsForTrie!(Key, Args))
{
    import std.exception : enforce;

private:
    // last index is not stored in table, it is used as an offset to values in a block.
    static if (is(Value == bool))// always pack bool
        alias V = BitPacked!(Value, 1);
    else
        alias V = Value;
    // product of 2^^bitSize over all predicates
    static auto deduceMaxIndex(Preds...)()
    {
        size_t idx = 1;
        foreach (v; Preds)
            idx *= 2^^v.bitSize;
        return idx;
    }

    static if (is(typeof(Args[0]) : Key)) // Args start with upper bound on Key
    {
        alias Prefix = Args[1..$];
        enum lastPageSize = 2^^Prefix[$-1].bitSize;
        enum translatedMaxIndex = mapTrieIndex!(Prefix)(Args[0]);
        // round the translated bound up to a whole number of last-level pages
        enum roughedMaxIndex =
            (translatedMaxIndex + lastPageSize-1)/lastPageSize*lastPageSize;
        // check wrap around - if wrapped, use the default deduction rule
        enum maxIndex = roughedMaxIndex < translatedMaxIndex ?
            deduceMaxIndex!(Prefix)() : roughedMaxIndex;
    }
    else
    {
        alias Prefix = Args;
        enum maxIndex = deduceMaxIndex!(Prefix)();
    }

    alias getIndex = mapTrieIndex!(Prefix);

    enum lastLevel = Prefix.length-1;
    // Per-level bookkeeping. Within this struct idx_ones is only initialized
    // (in the constructor); idx_zeros is the one that is read and updated.
    struct ConstructState
    {
        size_t idx_zeros, idx_ones;
    }
    // iteration over levels of Trie, each indexes its own level and thus a shortened domain
    size_t[Prefix.length] indices;
    // default filler value to use
    Value defValue;
    // this is a full-width index of next item
    size_t curIndex;
    // all-zeros page index, all-ones page index (+ indicator if there is such a page)
    ConstructState[Prefix.length] state;
    // the table being constructed
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), V) table;

    @disable this();

    //shortcut for index variable at level 'level'
    @property ref idx(size_t level)(){ return indices[level]; }

    // this function assumes no holes in the input so
    // indices are going one by one
    //
    // Writes `numVals` copies of `val` at the current position of `level`,
    // advancing that level's index and spilling each completed page.
    void addValue(size_t level, T)(T val, size_t numVals)
    {
        alias j = idx!level;
        enum pageSize = 1 << Prefix[level].bitSize;
        if (numVals == 0)
            return;
        auto ptr = table.slice!(level);
        if (numVals == 1)
        {
            static if (level == Prefix.length-1)
                ptr[j] = val;
            else
            {// can incur narrowing conversion
                assert(j < ptr.length);
                ptr[j] = force!(typeof(ptr[j]))(val);
            }
            j++;
            // a page was just completed
            if (j % pageSize == 0)
                spillToNextPage!level(ptr);
            return;
        }
        // longer row of values
        // get to the next page boundary
        immutable nextPB = (j + pageSize) & ~(pageSize-1);
        immutable n =  nextPB - j;// can fill right in this page
        if (numVals < n) //fits in current page
        {
            ptr[j .. j+numVals]  = val;
            j += numVals;
            return;
        }
        static if (level != 0)//on the first level it always fits
        {
            numVals -= n;
            //write till the end of current page
            ptr[j .. j+n]  = val;
            j += n;
            //spill to the next page
            spillToNextPage!level(ptr);
            // page at once loop
            // shortcut: when an all-zeros page is already known and val is
            // T.init, whole pages are recorded one level up as references to
            // that page instead of being written here
            if (state[level].idx_zeros != size_t.max && val == T.init)
            {
                alias NextIdx = typeof(table.slice!(level-1)[0]);
                addValue!(level-1)(force!NextIdx(state[level].idx_zeros),
                    numVals/pageSize);
                ptr = table.slice!level; //table structure might have changed
                numVals %= pageSize;
            }
            else
            {
                while (numVals >= pageSize)
                {
                    numVals -= pageSize;
                    ptr[j .. j+pageSize]  = val;
                    j += pageSize;
                    spillToNextPage!level(ptr);
                }
            }
            if (numVals)
            {
                // the leftovers, an incomplete page
                ptr[j .. j+numVals]  = val;
                j += numVals;
            }
        }
    }

    void spillToNextPage(size_t level, Slice)(ref Slice ptr)
    {
        // last level (i.e. topmost) has 1 "page"
        // thus it need not to add a new page on upper level
        static if (level != 0)
            spillToNextPageImpl!(level)(ptr);
    }

    // this can re-use the current page if duplicate or allocate a new one
    // it also makes sure that previous levels point to the correct page in this level
    void spillToNextPageImpl(size_t level, Slice)(ref Slice ptr)
    {
        alias NextIdx = typeof(table.slice!(level-1)[0]);
        NextIdx next_lvl_index;
        enum pageSize = 1 << Prefix[level].bitSize;
        assert(idx!level % pageSize == 0);
        // start of the page that has just been completed
        immutable last = idx!level-pageSize;
        const slice = ptr[idx!level - pageSize .. idx!level];
        size_t j;
        // look for an earlier page with the same contents
        for (j=0; j<last; j+=pageSize)
        {
            if (ptr[j .. j+pageSize] == slice)
            {
                // get index to it, reuse ptr space for the next block
                next_lvl_index = force!NextIdx(j/pageSize);
                version (none)
                {
                import std.stdio : writefln, writeln;
                writefln("LEVEL(%s) page mapped idx: %s: 0..%s  ---> [%s..%s]"
                        ,level
                        ,indices[level-1], pageSize, j, j+pageSize);
                writeln("LEVEL(", level
                        , ") mapped page is: ", slice, ": ", arrayRepr(ptr[j .. j+pageSize]));
                writeln("LEVEL(", level
                        , ") src page is :", ptr, ": ", arrayRepr(slice[0 .. pageSize]));
                }
                idx!level -= pageSize; // reuse this page, it is duplicate
                break;
            }
        }
        // the loop ran to completion, i.e. no duplicate was found
        if (j == last)
        {
    // NOTE: no goto in this function targets L_allocate_page or L_know_index
    L_allocate_page:
            next_lvl_index = force!NextIdx(idx!level/pageSize - 1);
            // remember the first all-zeros page of this level
            if (state[level].idx_zeros == size_t.max && ptr.zeros(j, j+pageSize))
            {
                state[level].idx_zeros = next_lvl_index;
            }
            // allocate next page
            version (none)
            {
            import std.stdio : writefln;
            writefln("LEVEL(%s) page allocated: %s"
                     , level, arrayRepr(slice[0 .. pageSize]));
            writefln("LEVEL(%s) index: %s ; page at this index %s"
                     , level
                     , next_lvl_index
                     , arrayRepr(
                         table.slice!(level)
                          [pageSize*next_lvl_index..(next_lvl_index+1)*pageSize]
                        ));
            }
            table.length!level = table.length!level + pageSize;
        }
    L_know_index:
        // for the previous level, values are indices to the pages in the current level
        addValue!(level-1)(next_lvl_index, 1);
        ptr = table.slice!level; //re-load the slice after moves
    }

    // idx - full-width index to fill with v (full-width index != key)
    // fills everything in the range of [curIndex, idx) with filler
    void putAt(size_t idx, Value v)
    {
        assert(idx >= curIndex);
        immutable numFillers = idx - curIndex;
        addValue!lastLevel(defValue, numFillers);
        addValue!lastLevel(v, 1);
        curIndex = idx + 1;
    }

    // ditto, but sets the range of [idxA, idxB) to v
    void putRangeAt(size_t idxA, size_t idxB, Value v)
    {
        assert(idxA >= curIndex);
        assert(idxB >= idxA);
        size_t numFillers = idxA - curIndex;
        addValue!lastLevel(defValue, numFillers);
        addValue!lastLevel(v, idxB - idxA);
        curIndex = idxB; // open-right
    }

    // message of the exception thrown by enforce in putRange / putValue
    enum errMsg = "non-monotonic prefix function(s), an unsorted range or "~
        "duplicate key->value mapping";

public:
    /**
        Construct a builder, where `filler` is a value
        to indicate empty slots (or "not found" condition).
    */
    this(Value filler)
    {
        curIndex = 0;
        defValue = filler;
        // zeros-page index, ones-page index
        // (size_t.max means "no such page seen yet")
        foreach (ref v; state)
            v = ConstructState(size_t.max, size_t.max);
        table = typeof(table)(indices);
        // one page per level is a bootstrap minimum
        foreach (i, Pred; Prefix)
            table.length!i = (1 << Pred.bitSize);
    }

    /**
        Put a value `v` into interval as
        mapped by keys from `a` to `b`.
        All slots prior to `a` are filled with
        the default filler.

        The interval is open on the right: the slot mapped by `b`
        is not written. An exception is thrown via `enforce` if the
        mapped indices are out of order or lie before the current position.
    */
    void putRange(Key a, Key b, Value v)
    {
        auto idxA = getIndex(a), idxB = getIndex(b);
        // indexes of key should always grow
        enforce(idxB >= idxA && idxA >= curIndex, errMsg);
        putRangeAt(idxA, idxB, v);
    }

    /**
        Put a value `v` into slot mapped by `key`.
        All slots prior to `key` are filled with the
        default filler.

        An exception is thrown via `enforce` if the mapped index
        lies before the current position.
    */
    void putValue(Key key, Value v)
    {
        auto idx = getIndex(key);
        enforce(idx >= curIndex, errMsg);
        putAt(idx, v);
    }

    /// Finishes construction of Trie, yielding an immutable Trie instance.
    auto build()
    {
        static if (maxIndex != 0) // doesn't cover full range of size_t
        {
            assert(curIndex <= maxIndex);
            // pad the rest of the domain with the filler
            addValue!lastLevel(defValue, maxIndex - curIndex);
        }
        else
        {
            if (curIndex != 0 // couldn't wrap around
                || (Prefix.length != 1 && indices[lastLevel] == 0)) // can be just empty
            {
                // the remaining count does not fit in one size_t value,
                // hence two calls: size_t.max - curIndex fillers plus one more
                addValue!lastLevel(defValue, size_t.max - curIndex);
                addValue!lastLevel(defValue, 1);
            }
            // else curIndex already completed the full range of size_t by wrapping around
        }
        return Trie!(V, Key, maxIndex, Prefix)(table);
    }
}
4104 
4105 /**
4106     $(P A generic Trie data-structure for a fixed number of stages.
4107     The design goal is optimal speed with smallest footprint size.
4108     )
4109     $(P It's intentionally read-only and doesn't provide constructors.
4110      To construct one use a special builder,
4111      see $(LREF TrieBuilder) and $(LREF buildTrie).
4112     )
4113 
4114 */
// `Args` is either a list of per-level predicates, or a maximum index
// (convertible to size_t) followed by such a list; only in the latter case
// does opIndex carry a bounds assertion.
@trusted private struct Trie(Value, Key, Args...)
if (isValidPrefixForTrie!(Key, Args)
    || (isValidPrefixForTrie!(Key, Args[1..$])
    && is(typeof(Args[0]) : size_t)))
{
    import std.range.primitives : isOutputRange;
    static if (is(typeof(Args[0]) : size_t))
    {
        private enum maxIndex = Args[0];
        private enum hasBoundsCheck = true;
        private alias Prefix = Args[1..$];
    }
    else
    {
        private enum hasBoundsCheck = false;
        private alias Prefix = Args;
    }

    // wraps an already built table
    private this()(typeof(_table) table)
    {
        _table = table;
    }

    // only for constant Tries constructed from precompiled tables
    private this()(const(size_t)[] offsets, const(size_t)[] sizes,
        const(size_t)[] data) const
    {
        _table = typeof(_table)(offsets, sizes, data);
    }

    /**
        $(P Lookup the `key` in this `Trie`. )

        $(P The lookup always succeeds if key fits the domain
        provided during construction. The whole domain defined
        is covered so instead of not found condition
        the sentinel (filler) value could be used. )

        $(P See $(LREF buildTrie), $(LREF TrieBuilder) for how to
        define a domain of `Trie` keys and the sentinel value. )

        Note:
        Domain range-checking is done with an `assert`, and only when
        a maximum index was supplied to the template. It is therefore
        active only in builds that compile assertions in, where an
        out-of-domain key results in assertion failure.
    */
    TypeOfBitPacked!Value opIndex()(Key key) const
    {
        static if (hasBoundsCheck)
            assert(mapTrieIndex!Prefix(key) < maxIndex);
        size_t idx;
        alias p = Prefix;
        idx = cast(size_t) p[0](key);
        // each level's entry, shifted left by the next predicate's bitSize
        // and added to that predicate's result, is the index into the next level
        foreach (i, v; p[0..$-1])
            idx = cast(size_t)((_table.ptr!i[idx]<<p[i+1].bitSize) + p[i+1](key));
        return _table.ptr!(p.length-1)[idx];
    }

    /// Forwards to `bytes!n` of the underlying table.
    @property size_t bytes(size_t n=size_t.max)() const
    {
        return _table.bytes!n;
    }

    /// `bytes!n` divided by the page size of level `n` (2^^bitSize),
    /// with half a page added before the division.
    @property size_t pages(size_t n)() const
    {
        return (bytes!n+2^^(Prefix[n].bitSize-1))
                /2^^Prefix[n].bitSize;
    }

    /// Writes the underlying table to `sink` by forwarding to its `store`.
    void store(OutRange)(scope OutRange sink) const
        if (isOutputRange!(OutRange, char))
    {
        _table.store(sink);
    }

private:
    // the multi-level table that backs all lookups
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), Value) _table;
}
4195 
4196 // create a tuple of 'sliceBits' that slice the 'top' of bits into pieces of sizes 'sizes'
4197 // left-to-right, the most significant bits first
template GetBitSlicing(size_t top, sizes...)
{
    static if (sizes.length == 0)
    {
        // nothing left to slice
        alias GetBitSlicing = AliasSeq!();
    }
    else
    {
        // lower bound of the topmost slice; the remaining sizes are carved
        // out below it
        enum size_t low = top - sizes[0];
        alias GetBitSlicing =
            AliasSeq!(sliceBits!(low, top),
                      GetBitSlicing!(low, sizes[1..$]));
    }
}
4207 
// callableWith!T yields a predicate template: it is true for `Pred` when
// `Pred(T.init)` compiles and its result type, unwrapped with
// TypeOfBitPacked, is a bit-packable type.
template callableWith(T)
{
    template callableWith(alias Pred)
    {
        static if (is(typeof(Pred(T.init)) Result))
            enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result));
        else
            enum callableWith = false; // not callable with a T at all
    }
}
4221 
4222 /*
4223     Check if `Prefix` is a valid set of predicates
4224     for `Trie` template having `Key` as the type of keys.
4225     This requires all predicates to be callable, take
4226     single argument of type `Key` and return unsigned value.
4227 */
template isValidPrefixForTrie(Key, Prefix...)
{
    import std.meta : allSatisfy;
    // the per-predicate check, bound to this Key
    alias acceptsKey = callableWith!Key;
    enum isValidPrefixForTrie = allSatisfy!(acceptsKey, Prefix); // TODO: tighten the screws
}
4233 
4234 /*
4235     Check if `Args` is a set of maximum key value followed by valid predicates
4236     for `Trie` template having `Key` as the type of keys.
4237 */
4238 template isValidArgsForTrie(Key, Args...)
4239 {
4240     static if (Args.length > 1)
4241     {
4242         enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args)
4243             || (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key));
4244     }
4245     else
4246         enum isValidArgsForTrie = isValidPrefixForTrie!Args;
4247 }
4248 
// Sum of the compile-time integers `ints`; 0 for an empty tuple.
// Usable in template constraints (evaluated via CTFE).
@property size_t sumOfIntegerTuple(ints...)()
{
    size_t total;
    static foreach (item; ints)
        total += item;
    return total;
}
4256 
4257 /**
4258     A shorthand for creating a custom multi-level fixed Trie
4259     from a `CodepointSet`. `sizes` are numbers of bits per level,
4260     with the most significant bits used first.
4261 
4262     Note: The sum of `sizes` must be equal 21.
4263 
4264     See_Also: $(LREF toTrie), which is even simpler.
4265 
4266     Example:
4267     ---
4268     {
4269         import std.stdio;
4270         auto set = unicode("Number");
4271         auto trie = codepointSetTrie!(8, 5, 8)(set);
4272         writeln("Input code points to test:");
4273         foreach (line; stdin.byLine)
4274         {
4275             int count=0;
4276             foreach (dchar ch; line)
4277                 if (trie[ch])// is number
4278                     count++;
4279             writefln("Contains %d number code points.", count);
4280         }
4281     }
4282     ---
4283 */
4284 public template codepointSetTrie(sizes...)
4285 if (sumOfIntegerTuple!sizes == 21)
4286 {
4287     auto codepointSetTrie(Set)(Set set)
4288         if (isCodepointSet!Set)
4289     {
4290         auto builder = TrieBuilder!(bool, dchar, lastDchar+1, GetBitSlicing!(21, sizes))(false);
4291         foreach (ival; set.byInterval)
4292             builder.putRange(ival[0], ival[1], true);
4293         return builder.build();
4294     }
4295 }
4296 
/// Type of Trie generated by codepointSetTrie function.
public template CodepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    // Same builder instantiation as in `codepointSetTrie`; only its type is taken.
    alias CodepointSetTrie = typeof(
        TrieBuilder!(bool, dchar, lastDchar+1, GetBitSlicing!(21, sizes))(false).build());
}
4304 
4305 /**
4306     A slightly more general tool for building fixed `Trie`
4307     for the Unicode data.
4308 
4309     Specifically unlike `codepointSetTrie` it's allows creating mappings
4310     of `dchar` to an arbitrary type `T`.
4311 
4312     Note: Overload taking `CodepointSet`s will naturally convert
4313     only to bool mapping `Trie`s.
4314 
4315     CodepointTrie is the type of Trie as generated by codepointTrie function.
4316 */
4317 public template codepointTrie(T, sizes...)
4318 if (sumOfIntegerTuple!sizes == 21)
4319 {
4320     alias Prefix = GetBitSlicing!(21, sizes);
4321 
4322     static if (is(TypeOfBitPacked!T == bool))
4323     {
4324         auto codepointTrie(Set)(const scope Set set)
4325             if (isCodepointSet!Set)
4326         {
4327             return codepointSetTrie(set);
4328         }
4329     }
4330 
4331     ///
4332     auto codepointTrie()(T[dchar] map, T defValue=T.init)
4333     {
4334         return buildTrie!(T, dchar, Prefix)(map, defValue);
4335     }
4336 
4337     // unsorted range of pairs
4338     ///
4339     auto codepointTrie(R)(R range, T defValue=T.init)
4340         if (isInputRange!R
4341             && is(typeof(ElementType!R.init[0]) : T)
4342             && is(typeof(ElementType!R.init[1]) : dchar))
4343     {
4344         // build from unsorted array of pairs
4345         // TODO: expose index sorting functions for Trie
4346         return buildTrie!(T, dchar, Prefix)(range, defValue, true);
4347     }
4348 }
4349 
// Usage example for codepointTrie: caching a per-code-point property in a Trie.
@system pure unittest
{
    import std.algorithm.comparison : max;
    import std.algorithm.searching : count;

    // pick characters from the Greek script
    auto set = unicode.Greek;

    // a user-defined property (or an expensive function)
    // that we want to look up
    static uint luckFactor(dchar ch)
    {
        // here we consider a character lucky
        // if its code point has a lot of identical hex-digits
        // (the code point is padded to 6 hex digits, so leading zeros count)
        // e.g. arabic letter DDAL (\u0688) is 000688 and has
        // a "luck factor" of 3 thanks to its three zero nibbles
        ubyte[6] nibbles; // 6 4-bit chunks of code point
        uint value = ch;
        foreach (i; 0 .. 6)
        {
            nibbles[i] = value & 0xF;
            value >>= 4;
        }
        uint luck;
        foreach (n; nibbles)
            luck = cast(uint) max(luck, count(nibbles[], n));
        return luck;
    }

    // only unsigned built-ins are supported at the moment
    alias LuckFactor = BitPacked!(uint, 3);

    // create a temporary associative array (AA)
    LuckFactor[dchar] map;
    foreach (ch; set.byCodepoint)
        map[ch] = LuckFactor(luckFactor(ch));

    // bits per stage are chosen randomly, feel free to optimize
    auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(map);

    // from now on the AA is not needed
    foreach (ch; set.byCodepoint)
        assert(trie[ch] == luckFactor(ch)); // verify
    // CJK is not Greek, thus it has the default value
    assert(trie['\u4444'] == 0);
    // and here is a couple of quite lucky Greek characters:
    // Greek small letter epsilon with dasia
    assert(trie['\u1F11'] == 3);
    // Ancient Greek metretes sign
    assert(trie['\U00010181'] == 3);

}
4401 
/// ditto
public template CodepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    // Type of the Trie a builder over the full `dchar` range produces
    // for values of type `T` and the requested per-level bit widths.
    alias CodepointTrie = typeof(
        TrieBuilder!(T, dchar, lastDchar+1, GetBitSlicing!(21, sizes))(T.init).build());
}
4409 
// Comparator factory for sorting (value, key) tuples: orders two tuples by
// the result of applying `Pred` to their key (the element at index 1).
package(std) template cmpK0(alias Pred)
{
    import std.typecons : Tuple;
    static bool cmpK0(Value, Key)
        (Tuple!(Value, Key) a, Tuple!(Value, Key) b)
    {
        auto lhsIndex = Pred(a[1]);
        auto rhsIndex = Pred(b[1]);
        return lhsIndex < rhsIndex;
    }
}
4419 
4420 /**
4421     The most general utility for construction of `Trie`s
4422     short of using `TrieBuilder` directly.
4423 
4424     Provides a number of convenience overloads.
4425     `Args` is tuple of maximum key value followed by
4426     predicates to construct index from key.
4427 
4428     Alternatively if the first argument is not a value convertible to `Key`
4429     then the whole tuple of `Args` is treated as predicates
4430     and the maximum Key is deduced from predicates.
4431 */
4432 private template buildTrie(Value, Key, Args...)
4433 if (isValidArgsForTrie!(Key, Args))
4434 {
4435     static if (is(typeof(Args[0]) : Key)) // prefix starts with upper bound on Key
4436     {
4437         alias Prefix = Args[1..$];
4438     }
4439     else
4440         alias Prefix = Args;
4441 
4442     alias getIndex = mapTrieIndex!(Prefix);
4443 
4444     // for multi-sort
4445     template GetComparators(size_t n)
4446     {
4447         static if (n > 0)
4448             alias GetComparators =
4449                 AliasSeq!(GetComparators!(n-1), cmpK0!(Prefix[n-1]));
4450         else
4451             alias GetComparators = AliasSeq!();
4452     }
4453 
4454     /*
4455         Build `Trie` from a range of a Key-Value pairs,
4456         assuming it is sorted by Key as defined by the following lambda:
4457         ------
4458         (a, b) => mapTrieIndex!(Prefix)(a) < mapTrieIndex!(Prefix)(b)
4459         ------
4460         Exception is thrown if it's detected that the above order doesn't hold.
4461 
4462         In other words $(LREF mapTrieIndex) should be a
4463         monotonically increasing function that maps `Key` to an integer.
4464 
4465         See_Also: $(REF sort, std,_algorithm),
4466         $(REF SortedRange, std,range),
4467         $(REF setUnion, std,_algorithm).
4468     */
4469     auto buildTrie(Range)(Range range, Value filler=Value.init)
4470         if (isInputRange!Range && is(typeof(Range.init.front[0]) : Value)
4471             && is(typeof(Range.init.front[1]) : Key))
4472     {
4473         auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
4474         foreach (v; range)
4475             builder.putValue(v[1], v[0]);
4476         return builder.build();
4477     }
4478 
4479     /*
4480         If `Value` is bool (or BitPacked!(bool, x)) then it's possible
4481         to build `Trie` from a range of open-right intervals of `Key`s.
4482         The requirement  on the ordering of keys (and the behavior on the
4483         violation of it) is the same as for Key-Value range overload.
4484 
4485         Intervals denote ranges of !`filler` i.e. the opposite of filler.
4486         If no filler provided keys inside of the intervals map to true,
4487         and `filler` is false.
4488     */
4489     auto buildTrie(Range)(Range range, Value filler=Value.init)
4490         if (is(TypeOfBitPacked!Value ==  bool)
4491             && isInputRange!Range && is(typeof(Range.init.front[0]) : Key)
4492             && is(typeof(Range.init.front[1]) : Key))
4493     {
4494         auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
4495         foreach (ival; range)
4496             builder.putRange(ival[0], ival[1], !filler);
4497         return builder.build();
4498     }
4499 
4500     auto buildTrie(Range)(Range range, Value filler, bool unsorted)
4501         if (isInputRange!Range
4502             && is(typeof(Range.init.front[0]) : Value)
4503             && is(typeof(Range.init.front[1]) : Key))
4504     {
4505         import std.algorithm.sorting : multiSort;
4506         alias Comps = GetComparators!(Prefix.length);
4507         if (unsorted)
4508             multiSort!(Comps)(range);
4509         return buildTrie(range, filler);
4510     }
4511 
4512     /*
4513         If `Value` is bool (or BitPacked!(bool, x)) then it's possible
4514         to build `Trie` simply from an input range of `Key`s.
4515         The requirement  on the ordering of keys (and the behavior on the
4516         violation of it) is the same as for Key-Value range overload.
4517 
4518         Keys found in range denote !`filler` i.e. the opposite of filler.
4519         If no filler provided keys map to true, and `filler` is false.
4520     */
4521     auto buildTrie(Range)(Range range, Value filler=Value.init)
4522         if (is(TypeOfBitPacked!Value ==  bool)
4523             && isInputRange!Range && is(typeof(Range.init.front) : Key))
4524     {
4525         auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
4526         foreach (v; range)
4527             builder.putValue(v, !filler);
4528         return builder.build();
4529     }
4530 
4531     /*
4532         If `Key` is unsigned integer `Trie` could be constructed from array
4533         of values where array index serves as key.
4534     */
4535     auto buildTrie()(Value[] array, Value filler=Value.init)
4536         if (isUnsigned!Key)
4537     {
4538         auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
4539         foreach (idx, v; array)
4540             builder.putValue(idx, v);
4541         return builder.build();
4542     }
4543 
4544     /*
4545         Builds `Trie` from associative array.
4546     */
4547     auto buildTrie(Key, Value)(Value[Key] map, Value filler=Value.init)
4548     {
4549         import std.array : array;
4550         import std.range : zip;
4551         auto range = array(zip(map.values, map.keys));
4552         return buildTrie(range, filler, true); // sort it
4553     }
4554 }
4555 
// Index predicate that returns its argument unchanged and advertises a width
// of `bits` bits. Used in place of assumeSize to reduce mangled name
// length & help DMD inline Trie functors.
struct clamp(size_t bits)
{
    enum bitSize = bits;

    static size_t opCall(T)(T arg)
    {
        return arg;
    }
}
4563 
// Index predicate that picks element `idx` of an indexable key (returned
// unchanged) and advertises a width of `bits` bits.
struct clampIdx(size_t idx, size_t bits)
{
    enum bitSize = bits;

    static size_t opCall(T)(T arg)
    {
        return arg[idx];
    }
}
4569 
4570 /**
4571     Conceptual type that outlines the common properties of all UTF Matchers.
4572 
4573     Note: For illustration purposes only, every method
4574     call results in assertion failure.
4575     Use $(LREF utfMatcher) to obtain a concrete matcher
4576     for UTF-8 or UTF-16 encodings.
4577 */
4578 public struct MatcherConcept
4579 {
4580     /**
4581         $(P Perform a semantic equivalent 2 operations:
4582         decoding a $(CODEPOINT) at front of `inp` and testing if
4583         it belongs to the set of $(CODEPOINTS) of this matcher. )
4584 
4585         $(P The effect on `inp` depends on the kind of function called:)
4586 
4587         $(P Match. If the codepoint is found in the set then range `inp`
4588         is advanced by its size in $(S_LINK Code unit, code units),
4589         otherwise the range is not modifed.)
4590 
4591         $(P Skip. The range is always advanced by the size
4592         of the tested $(CODEPOINT) regardless of the result of test.)
4593 
4594         $(P Test. The range is left unaffected regardless
4595         of the result of test.)
4596     */
4597     public bool match(Range)(ref Range inp)
4598         if (isRandomAccessRange!Range && is(ElementType!Range : char))
4599     {
4600        assert(false);
4601     }
4602 
4603     ///ditto
4604     public bool skip(Range)(ref Range inp)
4605         if (isRandomAccessRange!Range && is(ElementType!Range : char))
4606     {
4607         assert(false);
4608     }
4609 
4610     ///ditto
4611     public bool test(Range)(ref Range inp)
4612         if (isRandomAccessRange!Range && is(ElementType!Range : char))
4613     {
4614         assert(false);
4615     }
4616     ///
4617     pure @safe unittest
4618     {
4619         string truth = "2² = 4";
4620         auto m = utfMatcher!char(unicode.Number);
4621         assert(m.match(truth)); // '2' is a number all right
4622         assert(truth == "² = 4"); // skips on match
4623         assert(m.match(truth)); // so is the superscript '2'
4624         assert(!m.match(truth)); // space is not a number
4625         assert(truth == " = 4"); // unaffected on no match
4626         assert(!m.skip(truth)); // same test ...
4627         assert(truth == "= 4"); // but skips a codepoint regardless
4628         assert(!m.test(truth)); // '=' is not a number
4629         assert(truth == "= 4"); // test never affects argument
4630     }
4631 
4632     /**
4633         Advanced feature - provide direct access to a subset of matcher based a
4634         set of known encoding lengths. Lengths are provided in
4635         $(S_LINK Code unit, code units). The sub-matcher then may do less
4636         operations per any `test`/`match`.
4637 
4638         Use with care as the sub-matcher won't match
4639         any $(CODEPOINTS) that have encoded length that doesn't belong
4640         to the selected set of lengths. Also the sub-matcher object references
4641         the parent matcher and must not be used past the liftetime
4642         of the latter.
4643 
4644         Another caveat of using sub-matcher is that skip is not available
4645         preciesly because sub-matcher doesn't detect all lengths.
4646     */
4647     @property auto subMatcher(Lengths...)()
4648     {
4649         assert(0);
4650         return this;
4651     }
4652 
4653     pure @safe unittest
4654     {
4655         auto m = utfMatcher!char(unicode.Number);
4656         string square = "2²";
4657         // about sub-matchers
4658         assert(!m.subMatcher!(2,3,4).test(square)); // ASCII no covered
4659         assert(m.subMatcher!1.match(square)); // ASCII-only, works
4660         assert(!m.subMatcher!1.test(square)); // unicode '²'
4661         assert(m.subMatcher!(2,3,4).match(square));  //
4662         assert(square == "");
4663         wstring wsquare = "2²";
4664         auto m16 = utfMatcher!wchar(unicode.Number);
4665         // may keep ref, but the orignal (m16) must be kept alive
4666         auto bmp = m16.subMatcher!1;
4667         assert(bmp.match(wsquare)); // Okay, in basic multilingual plan
4668         assert(bmp.match(wsquare)); // And '²' too
4669     }
4670 }
4671 
4672 /**
4673     Test if `M` is an UTF Matcher for ranges of `Char`.
4674 */
4675 public enum isUtfMatcher(M, C) = __traits(compiles, (){
4676     C[] s;
4677     auto d = s.decoder;
4678     M m;
4679     assert(is(typeof(m.match(d)) == bool));
4680     assert(is(typeof(m.test(d)) == bool));
4681     static if (is(typeof(m.skip(d))))
4682     {
4683         assert(is(typeof(m.skip(d)) == bool));
4684         assert(is(typeof(m.skip(s)) == bool));
4685     }
4686     assert(is(typeof(m.match(s)) == bool));
4687     assert(is(typeof(m.test(s)) == bool));
4688 });
4689 
// Matchers produced by utfMatcher satisfy isUtfMatcher for both mutable and
// immutable code units of their encoding.
pure @safe unittest
{
    alias CharMatcher = typeof(utfMatcher!char(CodepointSet.init));
    alias WcharMatcher = typeof(utfMatcher!wchar(CodepointSet.init));
    static assert(isUtfMatcher!(CharMatcher, char));
    static assert(isUtfMatcher!(CharMatcher, immutable(char)));
    static assert(isUtfMatcher!(WcharMatcher, wchar));
    static assert(isUtfMatcher!(WcharMatcher, immutable(wchar)));
}
4699 
// How a matcher lookup treats its input range after testing a code point.
enum Mode
{
    alwaysSkip,  // advance past the code point whatever the result (`skip`)
    neverSkip,   // leave the range untouched (`test`)
    skipOnMatch  // advance only when the code point is in the set (`match`)
}
4705 
// Mixed into matchers to route the `C[]` overloads of match/skip/test to the
// range-based implementations.
mixin template ForwardStrings()
{
    // Calls the member named `fn` with `str` reinterpreted, in place, as the
    // range type that `byCodeUnit(str)` returns; the argument is passed by
    // reference through the cast pointer, so changes made by `fn` land in `str`.
    // NOTE(review): this relies on that range type having the same layout as
    // `C[]`, hence the @trusted - confirm against std.utf.byCodeUnit.
    private bool fwdStr(string fn, C)(ref C[] str) const @trusted
    {
        import std.utf : byCodeUnit;
        alias type = typeof(byCodeUnit(str));
        return mixin(fn~"(*cast(type*)&str)");
    }
}
4715 
// Building blocks of the UTF-8 matcher: one Trie per encoded length (1 to 4
// code units), the code that fills them from a code point set (`build`) and
// the match/skip/test front end (`DefMatcher`) shared by the full matcher
// (`Impl`) and its length-restricted views (`CherryPick`).
template Utf8Matcher()
{
    // true for the encoded lengths possible in UTF-8
    enum validSize(int sz) = sz >= 1 && sz <= 4;

    // Reports malformed input by throwing UTFException.
    void badEncoding() pure @safe
    {
        import std.utf : UTFException;
        throw new UTFException("Invalid UTF-8 sequence");
    }

    //for 1-stage ASCII
    alias AsciiSpec = AliasSeq!(bool, char, clamp!7);
    //for 2-stage lookup of 2 byte UTF-8 sequences
    alias Utf8Spec2 = AliasSeq!(bool, char[2],
        clampIdx!(0, 5), clampIdx!(1, 6));
    //ditto for 3 byte
    alias Utf8Spec3 = AliasSeq!(bool, char[3],
        clampIdx!(0, 4),
        clampIdx!(1, 6),
        clampIdx!(2, 6)
    );
    //ditto for 4 byte
    alias Utf8Spec4 = AliasSeq!(bool, char[4],
        clampIdx!(0, 3), clampIdx!(1, 6),
        clampIdx!(2, 6), clampIdx!(3, 6)
    );
    // Trie types for sequences of 1, 2, 3 and 4 code units, in that order
    alias Tables = AliasSeq!(
        typeof(TrieBuilder!(AsciiSpec)(false).build()),
        typeof(TrieBuilder!(Utf8Spec2)(false).build()),
        typeof(TrieBuilder!(Utf8Spec3)(false).build()),
        typeof(TrieBuilder!(Utf8Spec4)(false).build())
    );
    alias Table(int size) = Tables[size-1];

    // payload bits of the lead byte of a `size`-unit sequence:
    // 0x1F, 0x0F and 0x07 for sizes 2, 3 and 4
    enum leadMask(size_t size) = (cast(size_t) 1<<(7 - size))-1;
    // marker bits of that lead byte: 0xC0, 0xE0 and 0xF0 for sizes 2, 3 and 4
    enum encMask(size_t size) = ((1 << size)-1)<<(8-size);

    // Strips the 0x80 marker from a continuation byte and returns its 6
    // payload bits; any byte outside 0x80 .. 0xBF is reported via badEncoding.
    char truncate()(char ch) pure @safe
    {
        ch -= 0x80;
        if (ch < 0x40)
        {
            return ch;
        }
        else
        {
            badEncoding();
            return cast(char) 0;
        }
    }

    // Encodes `ch` to UTF-8 and strips the marker bits from each of the first
    // `sz` code units, leaving only payload bits - the key format of the
    // multi-unit tables.
    static auto encode(size_t sz)(dchar ch)
        if (sz > 1)
    {
        import std.utf : encodeUTF = encode;
        char[4] buf;
        encodeUTF(buf, ch);
        char[sz] ret;
        buf[0] &= leadMask!sz;
        foreach (n; 1 .. sz)
            buf[n] = buf[n] & 0x3f; //keep 6 lower bits
        ret[] = buf[0 .. sz];
        return ret;
    }

    // Splits `set` by encoded length, builds one Trie per length and returns
    // the full matcher over all four of them.
    auto build(Set)(Set set)
    {
        import std.algorithm.iteration : map;
        auto ascii = set & unicode.ASCII;
        auto utf8_2 = set & CodepointSet(0x80, 0x800);
        auto utf8_3 = set & CodepointSet(0x800, 0x1_0000);
        auto utf8_4 = set & CodepointSet(0x1_0000, lastDchar+1);
        auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
        auto utf8_2T = utf8_2.byCodepoint.map!(x=>encode!2(x)).buildTrie!(Utf8Spec2);
        auto utf8_3T = utf8_3.byCodepoint.map!(x=>encode!3(x)).buildTrie!(Utf8Spec3);
        auto utf8_4T = utf8_4.byCodepoint.map!(x=>encode!4(x)).buildTrie!(Utf8Spec4);
        alias Ret = Impl!(1,2,3,4);
        return Ret(asciiT, utf8_2T, utf8_3T, utf8_4T);
    }

    // Bootstrap UTF-8 static matcher interface
    // from 3 primitives: tab!(size), lookup and Sizes
    mixin template DefMatcher()
    {
        import std.format : format;
        import std.meta : Erase, staticIndexOf;
        enum hasASCII = staticIndexOf!(1, Sizes) >= 0;
        // the multi-unit lengths this matcher handles
        alias UniSizes = Erase!(1, Sizes);

        //generate dispatch code sequence for unicode parts
        // The result is an if/else chain that inspects the lead byte `ch` and
        // calls `lookup` for the matching length; it is mixed into
        // match/skip/test, each of which defines `ch`, `inp` and `mode`.
        static auto genDispatch()
        {
            string code;
            foreach (size; UniSizes)
                code ~= format(q{
                    if ((ch & ~leadMask!%d) == encMask!(%d))
                        return lookup!(%d, mode)(inp);
                    else
                }, size, size, size);
            static if (Sizes.length == 4) //covers all code unit cases
                code ~= "{ badEncoding(); return false; }";
            else
                code ~= "return false;"; //may be just fine but not covered
            return code;
        }
        enum dispatch = genDispatch();

        // Tests the code point at the front of `inp` and pops it only if it
        // is in the set. `inp` must not be empty.
        public bool match(Range)(ref Range inp) const
            if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
                !isDynamicArray!Range)
        {
            enum mode = Mode.skipOnMatch;
            assert(!inp.empty);
            immutable ch = inp[0];
            static if (hasASCII)
            {
                if (ch < 0x80)
                {
                    immutable r = tab!1[ch];
                    if (r)
                        inp.popFront();
                    return r;
                }
                else
                    mixin(dispatch);
            }
            else
                mixin(dispatch);
        }

        static if (Sizes.length == 4) // can skip iff can detect all encodings
        {
            // Tests the code point at the front of `inp` and pops it whatever
            // the result. `inp` must not be empty.
            public bool skip(Range)(ref Range inp) const
                if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
                    !isDynamicArray!Range)
            {
                enum mode = Mode.alwaysSkip;
                assert(!inp.empty);
                auto ch = inp[0];
                static if (hasASCII)
                {
                    if (ch < 0x80)
                    {
                        inp.popFront();
                        return tab!1[ch];
                    }
                    else
                        mixin(dispatch);
                }
                else
                    mixin(dispatch);
            }
        }

        // Tests the code point at the front of `inp` without consuming it.
        // `inp` must not be empty.
        public bool test(Range)(ref Range inp) const
            if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
                !isDynamicArray!Range)
        {
            enum mode = Mode.neverSkip;
            assert(!inp.empty);
            auto ch = inp[0];

            static if (hasASCII)
            {
                if (ch < 0x80)
                    return tab!1[ch];
                else
                    mixin(dispatch);
            }
            else
                mixin(dispatch);
        }

        // Array overloads: forwarded to the range versions above via fwdStr.
        bool match(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"match"(str);
        }

        bool skip(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"skip"(str);
        }

        bool test(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"test"(str);
        }

        mixin ForwardStrings;
    }

    // The matcher proper: owns one table per length listed in `Sizes`.
    struct Impl(Sizes...)
    {
        import std.meta : allSatisfy, staticMap;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
    private:
        //pick tables for chosen sizes
        alias OurTabs = staticMap!(Table, Sizes);
        OurTabs tables;
        mixin DefMatcher;
        //static dispatch helper UTF size ==> table
        alias tab(int i) = tables[i - 1];

        // View of this matcher restricted to the lengths `SizesToPick`;
        // it keeps a pointer to `this`.
        package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
        {
            return CherryPick!(Impl, SizesToPick)(&this);
        }

        // Looks up a sequence of `size` (2 to 4) code units at the front of
        // `inp`: strips the marker bits, rejects truncated and overlong
        // sequences via badEncoding, then consults the table for that length
        // and advances `inp` as `mode` demands.
        bool lookup(int size, Mode mode, Range)(ref Range inp) const
        {
            import std.range : popFrontN;
            if (inp.length < size)
            {
                badEncoding();
                return false;
            }
            char[size] needle = void;
            needle[0] = leadMask!size & inp[0];
            static foreach (i; 1 .. size)
            {
                needle[i] = truncate(inp[i]);
            }
            //overlong encoding checks
            static if (size == 2)
            {
                //0x80-0x7FF
                //got 6 bits in needle[1], must use at least 8 bits
                //so needle[0] must be at least 2
                if (needle[0] < 2) badEncoding();
            }
            else static if (size == 3)
            {
                //0x800-0xFFFF
                //got 6 bits in needle[2], must use at least 12bits
                //must use 6 bits in needle[1] or anything in needle[0]
                if (needle[0] == 0 && needle[1] < 0x20) badEncoding();
            }
            else static if (size == 4)
            {
                //0x1_0000-0x10_FFFF
                //got 2x6=12 bits in needle[2 .. 3] must use at least 17bits
                //must use 5 bits (or above) in needle[1] or anything in needle[0]
                if (needle[0] == 0 && needle[1] < 0x10) badEncoding();
            }
            static if (mode == Mode.alwaysSkip)
            {
                inp.popFrontN(size);
                return tab!size[needle];
            }
            else static if (mode == Mode.neverSkip)
            {
                return tab!size[needle];
            }
            else
            {
                static assert(mode == Mode.skipOnMatch);

                if (tab!size[needle])
                {
                    inp.popFrontN(size);
                    return true;
                }
                else
                    return false;
            }
        }
    }

    // Matcher restricted to the lengths in `Sizes`; delegates table access
    // and lookups to the parent matcher `*m`, which it does not own.
    struct CherryPick(I, Sizes...)
    {
        import std.meta : allSatisfy;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
    private:
        I* m;
        @property auto tab(int i)() const { return m.tables[i - 1]; }
        bool lookup(int size, Mode mode, Range)(ref Range inp) const
        {
            return m.lookup!(size, mode)(inp);
        }
        mixin DefMatcher;
    }
}
5003 
5004 template Utf16Matcher()
5005 {
    // true for the encoded lengths possible in UTF-16 (1 or 2 code units)
    enum validSize(int sz) = sz >= 1 && sz <= 2;
5007 
    // Reports malformed input by throwing UTFException.
    void badEncoding() pure @safe
    {
        import std.utf : UTFException;
        throw new UTFException("Invalid UTF-16 sequence");
    }
5013 
    // 1-stage ASCII
    alias AsciiSpec = AliasSeq!(bool, wchar, clamp!7);
    //2-stage BMP
    alias BmpSpec = AliasSeq!(bool, wchar, sliceBits!(7, 16), sliceBits!(0, 7));
    //4-stage - full Unicode
    //assume that 0xD800 & 0xDC00 bits are cleared
    //thus leaving 10 bit per wchar to worry about
    //(each 10-bit half is split 6+4 and 4+6 across the four stages)
    alias UniSpec = AliasSeq!(bool, wchar[2],
        assumeSize!(x=>x[0]>>4, 6), assumeSize!(x=>x[0]&0xf, 4),
        assumeSize!(x=>x[1]>>6, 4), assumeSize!(x=>x[1]&0x3f, 6),
    );
    // Trie types built from the three specs above
    alias Ascii = typeof(TrieBuilder!(AsciiSpec)(false).build());
    alias Bmp = typeof(TrieBuilder!(BmpSpec)(false).build());
    alias Uni = typeof(TrieBuilder!(UniSpec)(false).build());
5028 
5029     auto encode2(dchar ch)
5030     {
5031         ch -= 0x1_0000;
5032         assert(ch <= 0xF_FFFF);
5033         wchar[2] ret;
5034         //do not put surrogate bits, they are sliced off
5035         ret[0] = cast(wchar)(ch >> 10);
5036         ret[1] = (ch & 0xFFF);
5037         return ret;
5038     }
5039 
5040     auto build(Set)(Set set)
5041     {
5042         import std.algorithm.iteration : map;
5043         auto ascii = set & unicode.ASCII;
5044         auto bmp = (set & CodepointSet.fromIntervals(0x80, 0xFFFF+1))
5045             - CodepointSet.fromIntervals(0xD800, 0xDFFF+1);
5046         auto other = set - (bmp | ascii);
5047         auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
5048         auto bmpT = bmp.byCodepoint.map!(x=>cast(wchar) x).buildTrie!(BmpSpec);
5049         auto otherT = other.byCodepoint.map!(x=>encode2(x)).buildTrie!(UniSpec);
5050         alias Ret = Impl!(1,2);
5051         return Ret(asciiT, bmpT, otherT);
5052     }
5053 
    //bootstrap full UTF-16 matcher interface from
    //sizeFlags, lookupUni and ascii
    //(the host must also provide `Sizes`)
    mixin template DefMatcher()
    {
        // Tests the code point at the front of `inp` and pops it only if it
        // is in the set. `inp` must not be empty.
        public bool match(Range)(ref Range inp) const
            if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
                !isDynamicArray!Range)
        {
            enum mode = Mode.skipOnMatch;
            assert(!inp.empty);
            immutable ch = inp[0];
            static if (sizeFlags & 1)
            {
                // ASCII is answered from the dedicated `ascii` table
                if (ch < 0x80)
                {
                  if (ascii[ch])
                  {
                      inp.popFront();
                      return true;
                  }
                  else
                      return false;
                }
                return lookupUni!mode(inp);
            }
            else
                return lookupUni!mode(inp);
        }

        // skip is offered only when both encoded lengths are handled
        static if (Sizes.length == 2)
        {
            // Tests the code point at the front of `inp` and pops it whatever
            // the result. `inp` must not be empty.
            public bool skip(Range)(ref Range inp) const
                if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
                    !isDynamicArray!Range)
            {
                enum mode = Mode.alwaysSkip;
                assert(!inp.empty);
                immutable ch = inp[0];
                static if (sizeFlags & 1)
                {
                    if (ch < 0x80)
                    {
                        inp.popFront();
                        return ascii[ch];
                    }
                    else
                        return lookupUni!mode(inp);
                }
                else
                    return lookupUni!mode(inp);
            }
        }

        // Tests the code point at the front of `inp` without consuming it.
        // `inp` must not be empty.
        public bool test(Range)(ref Range inp) const
            if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
                !isDynamicArray!Range)
        {
            enum mode = Mode.neverSkip;
            assert(!inp.empty);
            auto ch = inp[0];
            static if (sizeFlags & 1)
                return ch < 0x80 ? ascii[ch] : lookupUni!mode(inp);
            else
                return lookupUni!mode(inp);
        }

        // Array overloads: forwarded to the range versions above via fwdStr.
        bool match(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"match"(str);
        }

        bool skip(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"skip"(str);
        }

        bool test(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"test"(str);
        }

        mixin ForwardStrings; //dispatch strings to range versions
    }
5140 
    // UTF-16 matcher state plus the `lookupUni` primitive that DefMatcher
    // builds upon. `Sizes` lists the code unit lengths (1 and/or 2) handled.
    struct Impl(Sizes...)
        if (Sizes.length >= 1 && Sizes.length <= 2)
    {
    private:
        import std.meta : allSatisfy;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1 and 2 code units are possible in UTF-16");
        // bit mask of handled lengths: bit 0 -> 1 code unit, bit 1 -> 2 code units
        static if (Sizes.length > 1)
            enum sizeFlags = Sizes[0] | Sizes[1];
        else
            enum sizeFlags = Sizes[0];

        // tables for single code unit sequences
        static if (sizeFlags & 1)
        {
            Ascii ascii;
            Bmp bmp;
        }
        // table for two code unit (surrogate pair) sequences
        static if (sizeFlags & 2)
        {
            Uni uni;
        }
        mixin DefMatcher;

        // Returns a CherryPick wrapper that holds a pointer to this matcher.
        package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
        {
            return CherryPick!(Impl, SizesToPick)(&this);
        }

        // Looks up the sequence at the front of `inp` in `bmp` or `uni`.
        // `mode` selects whether the input is advanced always, only on
        // a match, or never. Malformed input is reported via `badEncoding`
        // (defined elsewhere; presumably throws - confirm).
        bool lookupUni(Mode mode, Range)(ref Range inp) const
        {
            // offset from the start of the surrogate range, wrapping around in wchar
            wchar x = cast(wchar)(inp[0] - 0xD800);
            //not a high surrogate
            if (x > 0x3FF)
            {
                //low surrogate
                if (x <= 0x7FF) badEncoding();
                static if (sizeFlags & 1)
                {
                    auto ch = inp[0];
                    static if (mode == Mode.alwaysSkip)
                        inp.popFront();
                    static if (mode == Mode.skipOnMatch)
                    {
                        if (bmp[ch])
                        {
                            inp.popFront();
                            return true;
                        }
                        else
                            return false;
                    }
                    else
                        return bmp[ch];
                }
                else //skip is not available for sub-matchers, so just false
                    return false;
            }
            else
            {
                import std.range : popFrontN;
                static if (sizeFlags & 2)
                {
                    // a high surrogate must be followed by a low one
                    if (inp.length < 2)
                        badEncoding();
                    wchar y = cast(wchar)(inp[1] - 0xDC00);
                    //not a low surrogate
                    if (y > 0x3FF)
                        badEncoding();
                    // lookup key: the low 10 bits of each surrogate
                    wchar[2] needle = [inp[0] & 0x3ff, inp[1] & 0x3ff];
                    static if (mode == Mode.alwaysSkip)
                        inp.popFrontN(2);
                    static if (mode == Mode.skipOnMatch)
                    {
                        if (uni[needle])
                        {
                            inp.popFrontN(2);
                            return true;
                        }
                        else
                            return false;
                    }
                    else
                        return uni[needle];
                }
                else //ditto
                    return false;
            }
        }
    }
5230 
    // Light-weight view over a matcher `I`: keeps a pointer to it and
    // exposes the DefMatcher interface by forwarding `ascii` and `lookupUni`.
    struct CherryPick(I, Sizes...)
        if (Sizes.length >= 1 && Sizes.length <= 2)
    {
    private:
        import std.meta : allSatisfy;
        // the matcher all lookups are forwarded to
        I* m;
        // NOTE(review): the flags are copied from `I`, not derived from `Sizes`;
        // here `Sizes` only feeds the static assert below and the
        // `Sizes.length == 2` check inside DefMatcher - confirm this is intended.
        enum sizeFlags = I.sizeFlags;

        static if (sizeFlags & 1)
        {
            @property auto ascii()() const { return m.ascii; }
        }

        bool lookupUni(Mode mode, Range)(ref Range inp) const
        {
            return m.lookupUni!mode(inp);
        }
        mixin DefMatcher;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1 and 2 code units are possible in UTF-16");
    }
5252 }
5253 
// Builds a matcher for `set` via `Utf8Matcher!().build`.
private auto utf8Matcher(Set)(Set set)
{
    alias Builder = Utf8Matcher!();
    return Builder.build(set);
}
5258 
// Builds a matcher for `set` via `Utf16Matcher!().build`.
private auto utf16Matcher(Set)(Set set)
{
    alias Builder = Utf16Matcher!();
    return Builder.build(set);
}
5263 
5264 /**
5265     Constructs a matcher object
5266     to classify $(CODEPOINTS) from the `set` for encoding
5267     that has `Char` as code unit.
5268 
5269     See $(LREF MatcherConcept) for API outline.
5270 */
5271 public auto utfMatcher(Char, Set)(Set set)
5272 if (isCodepointSet!Set)
5273 {
5274     static if (is(Char : char))
5275         return utf8Matcher(set);
5276     else static if (is(Char : wchar))
5277         return utf16Matcher(set);
5278     else static if (is(Char : dchar))
5279         static assert(false, "UTF-32 needs no decoding,
5280             and thus not supported by utfMatcher");
5281     else
5282         static assert(false, "Only character types 'char' and 'wchar' are allowed");
5283 }
5284 
5285 
//a range of code units, packed with index to speed up forward iteration
//`offset` is the index in `s` the returned range starts at
package(std) auto decoder(C)(C[] s, size_t offset=0)
if (is(C : wchar) || is(C : char))
{
    // Random access range over `str`; the front is tracked by `idx`
    // instead of re-slicing, the back is dropped by shrinking `str`.
    static struct Decoder
    {
    pure nothrow:
        C[] str;
        size_t idx;

        @property bool empty()
        {
            return str.length == idx;
        }

        @property size_t length()
        {
            return str.length - idx;
        }
        alias opDollar = length;

        @property C front()
        {
            return str[idx];
        }

        @property C back()
        {
            return str[str.length - 1];
        }

        void popFront()
        {
            ++idx;
        }

        void popFrontN(size_t n)
        {
            idx += n;
        }

        void popBack()
        {
            str = str[0 .. str.length - 1];
        }

        @property auto save()
        {
            return this;
        }

        // indices are relative to the current front
        auto opIndex(size_t i)
        {
            return str[idx + i];
        }

        auto opSlice(size_t a, size_t b)
        {
            return Decoder(str[0 .. idx + b], idx + a);
        }
    }

    static assert(isRandomAccessRange!Decoder);
    static assert(is(ElementType!Decoder : C));
    return Decoder(s, offset);
}
5311 
// Walks a mixed ASCII / non-ASCII string with the full UTF-8 matcher and
// two of its sub-matchers, checking both the results and how far the
// decoder index moves for test/match/skip.
pure @safe unittest
{
    string rs = "hi! ネемног砀 текста";
    auto codec = rs.decoder;
    auto utf8 =  utf8Matcher(unicode.Letter);
    auto asc = utf8.subMatcher!(1);
    auto uni = utf8.subMatcher!(2,3,4);

    // h
    assert(asc.test(codec));
    assert(!uni.match(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 1);

    // i
    assert(asc.test(codec));
    assert(!uni.match(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 2);

    // !
    assert(!asc.match(codec));
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));
    assert(codec.idx == 3);

    // space
    assert(!asc.test(codec));
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));
    assert(codec.idx == 4);

    // the 7 non-ASCII letters that follow
    assert(utf8.test(codec));
    foreach (i; 0 .. 7)
    {
        assert(!asc.test(codec));
        assert(uni.test(codec));
        assert(utf8.skip(codec));
    }
    // the space after them is not a letter
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));

    //the same with match where applicable
    codec = rs.decoder;
    assert(utf8.match(codec));
    assert(codec.idx == 1);
    assert(utf8.match(codec));
    assert(codec.idx == 2);
    // a failed match leaves the index untouched
    assert(!utf8.match(codec));
    assert(codec.idx == 2);
    assert(!utf8.skip(codec));
    assert(!utf8.skip(codec));

    foreach (i; 0 .. 7)
    {
        assert(!asc.test(codec));
        assert(utf8.test(codec));
        assert(utf8.match(codec));
    }
    auto i = codec.idx;
    assert(!utf8.match(codec));
    assert(codec.idx == i);
}
5375 
// Cross-checks test/match/skip of the UTF-8 and UTF-16 matchers and their
// sub-matchers against every third code point of `unicode.L`.
pure @system unittest
{
    import std.range : stride;
    // Runs `test`, `match` and (when available) `skip` on the same position,
    // verifies they agree and restores the index after each call.
    static bool testAll(Matcher, Range)(ref Matcher m, ref Range r) @safe
    {
        bool t = m.test(r);
        auto save = r.idx;
        assert(t == m.match(r));
        assert(r.idx == save || t); //either no change or was match
        r.idx = save;
        static if (is(typeof(m.skip(r))))
        {
            assert(t == m.skip(r));
            assert(r.idx != save); //always changed
            r.idx = save;
        }
        return t;
    }
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto bmp = utf16.subMatcher!1;
    // two code unit (surrogate pair) sub-matcher; it is checked against
    // `len16 != 2` below, so it has to be picked with size 2, not 1
    auto nonBmp = utf16.subMatcher!2;
    auto utf8 = utfMatcher!char(unicode.L);
    auto ascii = utf8.subMatcher!1;
    auto uni2 = utf8.subMatcher!2;
    auto uni3 = utf8.subMatcher!3;
    auto uni24 = utf8.subMatcher!(2,4);
    foreach (ch; unicode.L.byCodepoint.stride(3))
    {
        import std.utf : encode;
        char[4] buf;
        wchar[2] buf16;
        auto len = encode(buf, ch);
        auto len16 = encode(buf16, ch);
        auto c8 = buf[0 .. len].decoder;
        auto c16 = buf16[0 .. len16].decoder;
        assert(testAll(utf16, c16));
        assert(testAll(bmp, c16) || len16 != 1);
        assert(testAll(nonBmp, c16) || len16 != 2);

        assert(testAll(utf8, c8));

        //submatchers return false on out of their domain
        assert(testAll(ascii, c8) || len != 1);
        assert(testAll(uni2, c8) || len != 2);
        assert(testAll(uni3, c8) || len != 3);
        assert(testAll(uni24, c8) || (len != 2 && len != 4));
    }
}
5424 
// cover decode fail cases of Matcher
// every input below is expected to make `test` throw
pure @safe unittest
{
    import std.algorithm.iteration : map;
    import std.exception : collectException;
    import std.format : format;
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto utf8 = utfMatcher!char(unicode.L);
    //decode failure cases UTF-8
    // NOTE(review): `\0x00` is lexed as the escape `\0` followed by the
    // characters "x00", not as a single zero byte; presumably `\x00`
    // was intended - confirm before changing the test data.
    alias fails8 = AliasSeq!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
        "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
        "\xCF\x00\0x00\0x00\x00");
    foreach (msg; fails8)
    {
        // on failure the assert message shows the offending bytes in hex
        assert(collectException((){
            auto s = msg;
            size_t idx = 0;
            utf8.test(s);
        }()), format("%( %2x %)", cast(immutable(ubyte)[]) msg));
    }
    //decode failure cases UTF-16
    // a high surrogate with nothing after it, and a lone low surrogate
    alias fails16 = AliasSeq!([0xD811], [0xDC02]);
    foreach (msg; fails16)
    {
        assert(collectException((){
            auto s = msg.map!(x => cast(wchar) x);
            utf16.test(s);
        }()));
    }
}
5455 
5456 /++
5457     Convenience function to construct optimal configurations for
5458     packed Trie from any `set` of $(CODEPOINTS).
5459 
5460     The parameter `level` indicates the number of trie levels to use,
5461     allowed values are: 1, 2, 3 or 4. Levels represent different trade-offs
5462     speed-size wise.
5463 
5464     $(P Level 1 is fastest and the most memory hungry (a bit array). )
5465     $(P Level 4 is the slowest and has the smallest footprint. )
5466 
5467     See the $(S_LINK Synopsis, Synopsis) section for example.
5468 
5469     Note:
5470     Level 4 stays very practical (being faster and more predictable)
5471     compared to using direct lookup on the `set` itself.
5472 
5473 
5474 +/
5475 public auto toTrie(size_t level, Set)(Set set)
5476 if (isCodepointSet!Set)
5477 {
5478     static if (level == 1)
5479         return codepointSetTrie!(21)(set);
5480     else static if (level == 2)
5481         return codepointSetTrie!(10, 11)(set);
5482     else static if (level == 3)
5483         return codepointSetTrie!(8, 5, 8)(set);
5484     else static if (level == 4)
5485          return codepointSetTrie!(6, 4, 4, 7)(set);
5486     else
5487         static assert(false,
5488             "Sorry, toTrie doesn't support levels > 4, use codepointSetTrie directly");
5489 }
5490 
5491 /**
5492     $(P Builds a `Trie` with typically optimal speed-size trade-off
5493     and wraps it into a delegate of the following type:
5494     $(D bool delegate(dchar ch)). )
5495 
5496     $(P Effectively this creates a 'tester' lambda suitable
5497     for algorithms like std.algorithm.find that take unary predicates. )
5498 
5499     See the $(S_LINK Synopsis, Synopsis) section for example.
5500 */
5501 public auto toDelegate(Set)(Set set)
5502 if (isCodepointSet!Set)
5503 {
5504     // 3 is very small and is almost as fast as 2-level (due to CPU caches?)
5505     auto t = toTrie!3(set);
5506     return (dchar ch) => t[ch];
5507 }
5508 
5509 /**
5510     $(P Opaque wrapper around unsigned built-in integers and
5511     code unit (char/wchar/dchar) types.
5512     Parameter `sz` indicates that the value is confined
5513     to the range of [0, 2^^sz$(RPAREN). With this knowledge it can be
5514     packed more tightly when stored in certain
5515     data-structures like trie. )
5516 
5517     Note:
5518     $(P The $(D BitPacked!(T, sz)) is implicitly convertible to `T`
5519     but not vise-versa. Users have to ensure the value fits in
5520     the range required and use the `cast`
5521     operator to perform the conversion.)
5522 */
5523 struct BitPacked(T, size_t sz)
5524 if (isIntegral!T || is(T:dchar))
5525 {
5526     enum bitSize = sz;
5527     T _value;
5528     alias _value this;
5529 }
5530 
5531 /*
5532     Depending on the form of the passed argument `bitSizeOf` returns
5533     the amount of bits required to represent a given type
5534     or a return type of a given functor.
5535 */
5536 template bitSizeOf(Args...)
5537 if (Args.length == 1)
5538 {
5539     import std.traits : ReturnType;
5540     alias T = Args[0];
5541     static if (__traits(compiles, { size_t val = T.bitSize; })) //(is(typeof(T.bitSize) : size_t))
5542     {
5543         enum bitSizeOf = T.bitSize;
5544     }
5545     else static if (is(ReturnType!T dummy == BitPacked!(U, bits), U, size_t bits))
5546     {
5547         enum bitSizeOf = bitSizeOf!(ReturnType!T);
5548     }
5549     else
5550     {
5551         enum bitSizeOf = T.sizeof*8;
5552     }
5553 }
5554 
5555 /**
5556     Tests if `T` is some instantiation of $(LREF BitPacked)!(U, x)
5557     and thus suitable for packing.
5558 */
5559 template isBitPacked(T)
5560 {
5561     static if (is(T dummy == BitPacked!(U, bits), U, size_t bits))
5562         enum isBitPacked = true;
5563     else
5564         enum isBitPacked = false;
5565 }
5566 
5567 /**
5568     Gives the type `U` from $(LREF BitPacked)!(U, x)
5569     or `T` itself for every other type.
5570 */
5571 template TypeOfBitPacked(T)
5572 {
5573     static if (is(T dummy == BitPacked!(U, bits), U, size_t bits))
5574         alias TypeOfBitPacked = U;
5575     else
5576         alias TypeOfBitPacked = T;
5577 }
5578 
5579 /*
5580     Wrapper, used in definition of custom data structures from `Trie` template.
5581     Applying it to a unary lambda function indicates that the returned value always
5582     fits within `bits` of bits.
5583 */
5584 struct assumeSize(alias Fn, size_t bits)
5585 {
5586     enum bitSize = bits;
5587     static auto ref opCall(T)(auto ref T arg)
5588     {
5589         return Fn(arg);
5590     }
5591 }
5592 
5593 /*
5594     A helper for defining lambda function that yields a slice
5595     of certain bits from an unsigned integral value.
5596     The resulting lambda is wrapped in assumeSize and can be used directly
5597     with `Trie` template.
5598 */
5599 struct sliceBits(size_t from, size_t to)
5600 {
5601     //for now bypass assumeSize, DMD has trouble inlining it
5602     enum bitSize = to-from;
5603     static auto opCall(T)(T x)
5604     out(result)
5605     {
5606         assert(result < (1 << to-from));
5607     }
5608     do
5609     {
5610         static assert(from < to);
5611         static if (from == 0)
5612             return x & ((1 << to)-1);
5613         else
5614         return (x >> from) & ((1<<(to-from))-1);
5615     }
5616 }
5617 
// Returns bits 0 .. 7 of `x`.
@safe pure nothrow @nogc uint low_8(uint x)
{
    // narrowing to ubyte drops everything above the low 8 bits
    return cast(ubyte) x;
}
// Returns bits 8 .. 15 of `x`, shifted down to bits 0 .. 7.
@safe pure nothrow @nogc uint midlow_8(uint x)
{
    return (x >> 8) & 0xFF;
}
// `low_8` / `midlow_8` wrapped with a declared result width of 8 bits
alias lo8 = assumeSize!(low_8, 8);
alias mlo8 = assumeSize!(midlow_8, 8);
5622 
// bitSizeOf must report the declared width for each kind of wrapper
@safe pure nothrow @nogc unittest
{
    static assert(bitSizeOf!lo8 == 8);
    static assert(bitSizeOf!(sliceBits!(4, 7)) == 3);
    static assert(bitSizeOf!(BitPacked!(uint, 2)) == 2);
}
5629 
// Compile-time sequence of the values start, start+1, ..., end-1
// (empty when start >= end).
template Sequence(size_t start, size_t end)
{
    static if (start >= end)
        alias Sequence = AliasSeq!();
    else
        alias Sequence = AliasSeq!(start, Sequence!(start + 1, end));
}
5637 
//---- TRIE TESTS ----
// Builds tries of 1 to 4 levels over small sets (plus one string-keyed
// and one ubyte-keyed trie) and compares lookups with the source data.
@system unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.sorting : sort;
    import std.array : array;
    import std.conv : text, to;
    import std.range : iota;
    // prints per-level memory use; a no-op unless compiled with version=std_uni_stats
    static trieStats(TRIE)(TRIE t)
    {
        version (std_uni_stats)
        {
            import std.stdio : writefln, writeln;
            writeln("---TRIE FOOTPRINT STATS---");
            static foreach (i; 0 .. t.table.dim)
            {
                writefln("lvl%s = %s bytes;  %s pages"
                         , i, t.bytes!i, t.pages!i);
            }
            writefln("TOTAL: %s bytes", t.bytes);
            version (none)
            {
                writeln("INDEX (excluding value level):");
                static foreach (i; 0 .. t.table.dim-1)
                    writeln(t.table.slice!(i)[0 .. t.table.length!i]);
            }
            writeln("---------------------------");
        }
    }
    //@@@BUG link failure, lambdas not found by linker somehow (in case of trie2)
    // alias lo8   = assumeSize!(8, function (uint x) { return x&0xFF; });
    // alias next8 = assumeSize!(7, function (uint x) { return (x&0x7F00)>>8; });
    alias Set = CodepointSet;
    auto set = Set('A','Z','a','z');
    auto trie = buildTrie!(bool, uint, 256, lo8)(set.byInterval);// simple bool array
    for (int a='a'; a<'z';a++)
        assert(trie[a]);
    for (int a='A'; a<'Z';a++)
        assert(trie[a]);
    for (int a=0; a<'A'; a++)
        assert(!trie[a]);
    for (int a ='Z'; a<'a'; a++)
        assert(!trie[a]);
    trieStats(trie);

    // two-level trie: the same pattern repeats in several 256-wide pages
    auto redundant2 = Set(
        1, 18, 256+2, 256+111, 512+1, 512+18, 768+2, 768+111);
    auto trie2 = buildTrie!(bool, uint, 1024, mlo8, lo8)(redundant2.byInterval);
    trieStats(trie2);
    foreach (e; redundant2.byCodepoint)
        assert(trie2[e], text(cast(uint) e, " - ", trie2[e]));
    foreach (i; 0 .. 1024)
    {
        assert(trie2[i] == (i in redundant2));
    }


    // three-level trie indexed by bit slices 6..8, 4..6 and 0..4
    auto redundant3 = Set(
          2,    4,    6,    8,    16,
       2+16, 4+16, 16+6, 16+8, 16+16,
       2+32, 4+32, 32+6, 32+8,
      );

    enum max3 = 256;
    // sliceBits
    auto trie3 = buildTrie!(bool, uint, max3,
            sliceBits!(6,8), sliceBits!(4,6), sliceBits!(0,4)
        )(redundant3.byInterval);
    trieStats(trie3);
    foreach (i; 0 .. max3)
        assert(trie3[i] == (i in redundant3), text(cast(uint) i));

    // four-level trie over a 16-bit key space; only membership is checked
    auto redundant4 = Set(
            10, 64, 64+10, 128, 128+10, 256, 256+10, 512,
            1000, 2000, 3000, 4000, 5000, 6000
        );
    enum max4 = 2^^16;
    auto trie4 = buildTrie!(bool, size_t, max4,
            sliceBits!(13, 16), sliceBits!(9, 13), sliceBits!(6, 9) , sliceBits!(0, 6)
        )(redundant4.byInterval);
    foreach (i; 0 .. max4)
    {
        if (i in redundant4)
            assert(trie4[i], text(cast(uint) i));
    }
    trieStats(trie4);

        // string keys; the input is sorted by the mapped index before building
        alias mapToS = mapTrieIndex!(useItemAt!(0, char));
        string[] redundantS = ["tea", "start", "orange"];
        redundantS.sort!((a,b) => mapToS(a) < mapToS(b))();
        auto strie = buildTrie!(bool, string, useItemAt!(0, char))(redundantS);
        // using first char only
        assert(redundantS == ["orange", "start", "tea"]);
        assert(strie["test"], text(strie["test"]));
        assert(!strie["aea"]);
        assert(strie["s"]);

    // a bit size test
    auto a = array(map!(x => to!ubyte(x))(iota(0, 256)));
    auto bt = buildTrie!(bool, ubyte, sliceBits!(7, 8), sliceBits!(5, 7), sliceBits!(0, 5))(a);
    trieStats(bt);
    foreach (i; 0 .. 256)
        assert(bt[cast(ubyte) i]);
}
5742 
// Key mapper for tries keyed by arrays of `T`: yields the element at
// position `idx`, with the full bit width of `T` as its declared size.
template useItemAt(size_t idx, T)
if (isIntegral!T || is(T: dchar))
{
    size_t impl(const scope T[] arr){ return arr[idx]; }
    alias useItemAt = assumeSize!(impl, 8*T.sizeof);
}
5749 
// Key mapper for tries keyed by arrays of `T`: yields the last element,
// with the full bit width of `T` as its declared size.
template useLastItem(T)
{
    size_t impl(const scope T[] arr){ return arr[$-1]; }
    alias useLastItem = assumeSize!(impl, 8*T.sizeof);
}
5755 
// Sum of `bitSizeOf` over every element of `Prefix` (0 for an empty list).
template fullBitSize(Prefix...)
{
    static if (Prefix.length == 0)
        enum fullBitSize = 0;
    else
        enum fullBitSize = bitSizeOf!(Prefix[0]) + fullBitSize!(Prefix[1 .. $]);
}
5763 
// Computes the sequence of BitPacked index types, one per trie level
// except the last (value) level, for the key predicates in `Prefix`.
// `fullBits` is the number of bits available at the current level.
template idxTypes(Key, size_t fullBits, Prefix...)
{
    static if (Prefix.length == 1)
    {// the last level is value level, so no index once reduced to 1-level
        alias idxTypes = AliasSeq!();
    }
    else
    {
        // Important note on bit packing
        // Each level has to hold enough of bits to address the next one
        // The bottom level is known to hold full bit width
        // thus its size in pages is full_bit_width - size_of_last_prefix
        // Recurse on this notion
        alias idxTypes =
            AliasSeq!(
                idxTypes!(Key, fullBits - bitSizeOf!(Prefix[$-1]), Prefix[0..$-1]),
                BitPacked!(typeof(Prefix[$-2](Key.init)), fullBits - bitSizeOf!(Prefix[$-1]))
            );
    }
}
5784 
5785 //============================================================================
5786 
// Three-way comparison of two property names that ignores white space,
// '-' and '_' and applies `std.ascii.toLower` to every character.
// Returns 0 when the names are equivalent under these rules, otherwise
// the result of `cmp` on the normalized sequences.
@safe pure int comparePropertyName(Char1, Char2)(const(Char1)[] a, const(Char2)[] b)
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    import std.algorithm.comparison : cmp;
    import std.algorithm.iteration : map, filter;
    import std.ascii : toLower;

    // characters that take part in the comparison
    static bool significant(dchar c)
    {
        return !(c.isWhite || c == '-' || c == '_');
    }

    auto lhs = a.map!toLower.filter!significant;
    auto rhs = b.map!toLower.filter!significant;
    return cmp(lhs, rhs);
}
5798 
// '-' and letter case must not affect the comparison (0 means equal)
@safe pure unittest
{
    assert(!comparePropertyName("foo-bar", "fooBar"));
}
5803 
// "Less than" predicate over property names, built on comparePropertyName
// so that both share the same normalization rules.
bool propertyNameLess(Char1, Char2)(const(Char1)[] a, const(Char2)[] b) @safe pure
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    return comparePropertyName(a, b) < 0;
}
5809 
5810 //============================================================================
5811 // Utilities for compression of Unicode code point sets
5812 //============================================================================
5813 
// Appends `val` to `arr` in a 1, 2 or 3 byte encoding:
//   val < 128      -> one byte holding the value itself
//   val < 2^^13    -> tag 0b100 in the top 3 bits, then 13 bits of value
//   val < 2^^21    -> tag 0b101 in the top 3 bits, then 21 bits of value
// Multi-byte forms are stored most significant byte first.
@safe void compressTo(uint val, ref scope ubyte[] arr) pure nothrow
{
    // not optimized as usually done 1 time (and not public interface)
    enum ubyte twoByteTag = 0b1_00 << 5;
    enum ubyte threeByteTag = 0b1_01 << 5;

    if (val < 128)
    {
        arr ~= cast(ubyte) val;
        return;
    }
    if (val < (1 << 13))
    {
        arr ~= cast(ubyte)(twoByteTag | cast(ubyte)(val >> 8));
        arr ~= cast(ubyte) val;
        return;
    }
    assert(val < (1 << 21));
    arr ~= cast(ubyte)(threeByteTag | cast(ubyte)(val >> 16));
    arr ~= cast(ubyte)(val >> 8);
    arr ~= cast(ubyte) val;
}
5832 
// Decodes one value written by the 1/2/3 byte scheme of compressTo,
// starting at `arr[idx]`, and advances `idx` past the bytes consumed.
// Throws (via enforce) when the trailing bytes of a multi-byte value
// are missing.
@safe uint decompressFrom(scope const(ubyte)[] arr, ref size_t idx) pure
{
    import std.exception : enforce;
    immutable lead = arr[idx++];
    if ((lead & 0x80) == 0) // no top bit -> [0 .. 127]
        return lead;
    // bit 5 of the lead byte tells whether 1 or 2 more bytes follow
    immutable tail = (lead & 0x20) ? 2 : 1;
    enforce(idx + tail <= arr.length, "bad code point interval encoding");
    uint result = lead & 0x1F;
    foreach (b; arr[idx .. idx + tail])
        result = (result << 8) | b;
    idx += tail;
    return result;
}
5847 
5848 
// Encodes a range of intervals as a byte stream: every bound is stored
// (via compressTo) as the difference from the previously stored bound.
// An upper bound equal to lastDchar+1 is left out of the stream.
package(std) ubyte[] compressIntervals(Range)(Range intervals)
if (isInputRange!Range && isIntegralPair!(ElementType!Range))
{
    ubyte[] encoded;
    uint prev = 0;
    // RLE encode
    foreach (pair; intervals)
    {
        compressTo(pair[0] - prev, encoded);
        prev = pair[0];
        // till the end of the domain so don't store it
        if (pair[1] == lastDchar+1)
            continue;
        compressTo(pair[1] - prev, encoded);
        prev = pair[1];
    }
    return encoded;
}
5867 
// Checks the exact bytes produced by compressIntervals, the values read
// back by decompressFrom, and the compress/decompress round trip.
@safe pure unittest
{
    import std.algorithm.comparison : equal;
    import std.typecons : tuple;

    auto run = [tuple(80, 127), tuple(128, (1 << 10)+128)];
    // deltas: 80, 47, 1 as single bytes, then 1024 in the two-byte form
    ubyte[] enc = [cast(ubyte) 80, 47, 1, (0b1_00 << 5) | (1 << 2), 0];
    assert(compressIntervals(run) == enc);
    auto run2 = [tuple(0, (1 << 20)+512+1), tuple((1 << 20)+512+4, lastDchar+1)];
    ubyte[] enc2 = [cast(ubyte) 0, (0b1_01 << 5) | (1 << 4), 2, 1, 3]; // odd length-ed
    assert(compressIntervals(run2) == enc2);
    size_t  idx = 0;
    assert(decompressFrom(enc, idx) == 80);
    assert(decompressFrom(enc, idx) == 47);
    assert(decompressFrom(enc, idx) == 1);
    assert(decompressFrom(enc, idx) == (1 << 10));
    idx = 0;
    assert(decompressFrom(enc2, idx) == 0);
    assert(decompressFrom(enc2, idx) == (1 << 20)+512+1);
    assert(equal(decompressIntervals(compressIntervals(run)), run));
    assert(equal(decompressIntervals(compressIntervals(run2)), run2));
}
5890 
// Wraps compressed `data` in a DecompressedIntervals range, which yields
// `CodepointInterval`s decoded on demand.
@safe package(std) auto decompressIntervals(const(ubyte)[] data) pure
{
    auto intervals = DecompressedIntervals(data);
    return intervals;
}
5896 
// Forward range of CodepointInterval decoded from a compressed byte
// stream. Each bound is stored as a delta from the previous bound.
@safe struct DecompressedIntervals
{
pure:
    const(ubyte)[] _stream; // compressed input
    size_t _idx;            // read position in _stream; size_t.max once exhausted
    CodepointInterval _front;

    // Decodes the first interval right away (or becomes empty).
    this(const(ubyte)[] stream)
    {
        _stream = stream;
        popFront();
    }

    @property bool empty() const
    {
        return _idx == size_t.max;
    }

    @property CodepointInterval front()
    {
        assert(!empty);
        return _front;
    }

    void popFront()
    {
        // nothing left to decode: switch to the empty state
        if (_idx == _stream.length)
        {
            _idx = size_t.max;
            return;
        }
        // lower bound is relative to the previous upper bound
        _front[0] = _front[1] + decompressFrom(_stream, _idx);
        if (_idx != _stream.length)
            _front[1] = _front[0] + decompressFrom(_stream, _idx);
        else // odd length ---> till the end
            _front[1] = lastDchar+1;
    }

    @property DecompressedIntervals save() return scope { return this; }
}
5941 
// DecompressedIntervals must satisfy the input and forward range concepts
@safe pure nothrow @nogc unittest
{
    static assert(isInputRange!DecompressedIntervals);
    static assert(isForwardRange!DecompressedIntervals);
}
5947 
5948 //============================================================================
5949 
5950 version (std_uni_bootstrap){}
5951 else
5952 {
5953 
// helper for looking up code point sets
// Binary search over `table` (treated as sorted by propertyNameLess on
// the entries' `name`). Returns the index of the entry whose name
// matches `name` under comparePropertyName, or -1 if there is none.
ptrdiff_t findUnicodeSet(alias table, C)(const scope C[] name)
{
    import std.algorithm.iteration : map;
    import std.range : assumeSorted;
    auto byName = assumeSorted!((a,b) => propertyNameLess(a,b))
        (table.map!"a.name"());
    // number of entries ordered before `name` == position of the candidate
    immutable size_t pos = byName.lowerBound(name).length;
    if (pos >= byName.length)
        return -1;
    if (comparePropertyName(byName[pos], name) != 0)
        return -1;
    return pos;
}
5966 
// another one that loads it
// Looks `name` up in `table`; on success stores the set built from the
// entry's `compressed` data in `dest` and returns true. Returns false
// and leaves `dest` untouched when the name is not found.
bool loadUnicodeSet(alias table, Set, C)(const scope C[] name, ref Set dest)
{
    immutable pos = findUnicodeSet!table(name);
    if (pos < 0)
        return false;
    dest = Set(asSet(table[pos].compressed));
    return true;
}
5978 
// Resolves a property name to a code point set stored in `target`.
// Names are compared with comparePropertyName. Cumulative general
// categories and a few convenience names ("graphical", "any", "ascii")
// are assembled here; every other name is looked up in `uniProps.tab`.
// Returns true on success, false when the table lookup fails.
bool loadProperty(Set=CodepointSet, C)
    (const scope C[] name, ref Set target) pure
{
    import std.internal.unicode_tables : uniProps; // generated file
    alias ucmp = comparePropertyName;
    // conjure cumulative properties by hand
    if (ucmp(name, "L") == 0 || ucmp(name, "Letter") == 0)
    {
        target = asSet(uniProps.Lu);
        target |= asSet(uniProps.Ll);
        target |= asSet(uniProps.Lt);
        target |= asSet(uniProps.Lo);
        target |= asSet(uniProps.Lm);
    }
    else if (ucmp(name,"LC") == 0 || ucmp(name,"Cased Letter")==0)
    {
        target = asSet(uniProps.Ll);
        target |= asSet(uniProps.Lu);
        target |= asSet(uniProps.Lt);// Title case
    }
    else if (ucmp(name, "M") == 0 || ucmp(name, "Mark") == 0)
    {
        target = asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);
    }
    else if (ucmp(name, "N") == 0 || ucmp(name, "Number") == 0)
    {
        target = asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);
    }
    else if (ucmp(name, "P") == 0 || ucmp(name, "Punctuation") == 0)
    {
        target = asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);
    }
    else if (ucmp(name, "S") == 0 || ucmp(name, "Symbol") == 0)
    {
        target = asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
    }
    else if (ucmp(name, "Z") == 0 || ucmp(name, "Separator") == 0)
    {
        target = asSet(uniProps.Zs);
        target |= asSet(uniProps.Zl);
        target |= asSet(uniProps.Zp);
    }
    else if (ucmp(name, "C") == 0 || ucmp(name, "Other") == 0)
    {
        target = asSet(uniProps.Cc);
        target |= asSet(uniProps.Cf);
        target |= asSet(uniProps.Cs);
        target |= asSet(uniProps.Co);
        target |= asSet(uniProps.Cn);
    }
    else if (ucmp(name, "graphical") == 0)
    {
        // Alphabetic plus all of M, N, P and S, plus Zs only out of Z
        target = asSet(uniProps.Alphabetic);

        target |= asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);

        target |= asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);

        target |= asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);

        target |= asSet(uniProps.Zs);

        target |= asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
    }
    else if (ucmp(name, "any") == 0)
        target = Set.fromIntervals(0, 0x110000);
    else if (ucmp(name, "ascii") == 0)
        target = Set.fromIntervals(0, 0x80);
    else
        // not a hand-assembled name: defer to the generated table
        return loadUnicodeSet!(uniProps.tab)(name, target);
    return true;
}
6077 
// CTFE-only helper for checking property names at compile-time.
// Returns `true` when `name` matches (per `comparePropertyName`) one of the
// names that `loadProperty` assembles by hand rather than reading from the
// generated tables. The list must stay in sync with `loadProperty`; "C"/"Other"
// was previously missing, so the compile-time check rejected a name the
// run-time loader accepts.
@safe bool isPrettyPropertyName(C)(const scope C[] name)
{
    import std.algorithm.searching : find;
    auto names = [
        "L", "Letter",
        "LC", "Cased Letter",
        "M", "Mark",
        "N", "Number",
        "P", "Punctuation",
        "S", "Symbol",
        "Z", "Separator",
        "C", "Other",
        "Graphical",
        "any",
        "ascii"
    ];
    auto x = find!(x => comparePropertyName(x, name) == 0)(names);
    return !x.empty;
}
6097 
// CTFE-only, not optimized: reports whether `table` has an entry for `name`,
// i.e. whether `findUnicodeSet` yields a non-negative index.
@safe private static bool findSetName(alias table, C)(const scope C[] name)
{
    immutable pos = findUnicodeSet!table(name);
    return !(pos < 0);
}
6103 
// Mixin providing name-based lookup of sets stored in `table`; `kind` is only
// used to word the error messages.
template SetSearcher(alias table, string kind)
{
    /// Run-time checked search.
    static auto opCall(C)(const scope C[] name)
        if (is(C : dchar))
    {
        import std.conv : to;
        CodepointSet found;
        if (!loadUnicodeSet!table(name, found))
        {
            throw new Exception("No unicode set for "~kind~" by name "
                ~name.to!string()~" was found.");
        }
        return found;
    }
    /// Compile-time checked search.
    static @property auto opDispatch(string name)()
    {
        // reject unknown names while compiling
        static assert(findSetName!table(name), "No unicode set for "~kind~" by name "
            ~name~" was found.");
        CodepointSet found;
        loadUnicodeSet!table(name, found);
        return found;
    }
}
6131 
// Characters that need escaping in a string posed as a regular expression.
// The set parser expands this list into `case` labels, so a backslash followed
// by any of these characters stands for the character itself.
package(std) alias Escapables = AliasSeq!('[', ']', '\\', '^', '$', '.', '|', '?', ',', '-',
    ';', ':', '#', '&', '%', '/', '<', '>', '`',  '*', '+', '(', ')', '{', '}',  '~');
6135 
// Evaluates the D expression given as the string `expr` and remembers the
// result in a function-local static, so later run-time calls of the same
// instantiation return the stored value. During CTFE nothing is cached and the
// expression is simply evaluated.
package(std) CodepointSet memoizeExpr(string expr)()
{
    if (__ctfe)
        return mixin(expr);
    alias Result = typeof(mixin(expr));
    static Result cached;
    static bool ready;
    if (ready)
        return cached;
    cached = mixin(expr);
    ready = true;
    return cached;
}
6150 
// Property backing the \w character class: the union of the Alphabetic, Mn, Mc,
// Me, Nd and Pc sets. The expression is routed through memoizeExpr instead of
// being evaluated inline on every call.
package(std) @property CodepointSet wordCharacter() @safe
{
    return memoizeExpr!("unicode.Alphabetic | unicode.Mn | unicode.Mc
        | unicode.Me | unicode.Nd | unicode.Pc")();
}
6157 
// Basic LIFO stack on top of a dynamic array; kept generic in case it gets used
// anywhere other than the parser.
package(std) struct Stack(T)
{
@safe:
    T[] data;

    // true when nothing has been pushed (or everything was popped)
    @property bool empty(){ return data.length == 0; }

    // number of stored elements
    @property size_t length(){ return data.length; }

    // append `val` as the new top element
    void push(T val){ data ~= val;  }

    // remove and return the top element; the stack must not be empty
    @trusted T pop()
    {
        assert(!empty);
        immutable last = data.length - 1;
        auto popped = data[last];
        data = data[0 .. last];
        // not available during CTFE; at run time it tells the runtime that
        // appending to the shortened slice is safe
        if (!__ctfe)
            cast(void) data.assumeSafeAppend();
        return popped;
    }

    // reference to the top element; the stack must not be empty
    @property ref T top()
    {
        assert(!empty);
        return data[data.length - 1];
    }
}
6185 
// Parses exactly `maxDigit` hexadecimal digits from the front of `str` and
// returns the code point they denote; the digits are consumed from `str`.
// Throws an `Exception` when the input ends early ("incomplete escape
// sequence"), when a character is not a hex digit ("invalid escape sequence"),
// or when the value is above 0x10FFFF ("invalid codepoint"). On failure the
// digits read so far have already been consumed.
package(std) dchar parseUniHex(Range)(ref Range str, size_t maxDigit)
{
    import std.exception : enforce;
    //std.conv.parse is both @system and bogus
    uint val;
    for (size_t k = 0; k < maxDigit; k++)
    {
        enforce(!str.empty, "incomplete escape sequence");
        // Another digit would shift bits out of the 32-bit accumulator. The
        // real value is then far above 0x10FFFF, so report it rather than let
        // it wrap around to a small, valid-looking code point. This can only
        // trigger when more than 8 digits are requested.
        enforce(val <= (uint.max >> 4), "invalid codepoint");
        //accepts ascii only, so it's OK to index directly
        immutable current = str.front;
        if ('0' <= current && current <= '9')
            val = val * 16 + current - '0';
        else if ('a' <= current && current <= 'f')
            val = val * 16 + current -'a' + 10;
        else if ('A' <= current && current <= 'F')
            val = val * 16 + current - 'A' + 10;
        else
            throw new Exception("invalid escape sequence");
        str.popFront();
    }
    enforce(val <= 0x10FFFF, "invalid codepoint");
    return val;
}
6211 
@safe unittest
{
    import std.algorithm.searching : canFind;
    import std.exception : collectException;

    // inputs containing a non-hex character are rejected
    foreach (bad; [ "000j", "000z", "FffG", "0Z"])
    {
        auto err = collectException(parseUniHex(bad, bad.length));
        assert(err.msg.canFind("invalid escape sequence"));
    }

    // well-formed inputs and the values they decode to
    string[] inputs = [ "01", "ff", "00af", "10FFFF" ];
    int[] expected = [ 1, 0xFF, 0xAF, 0x10FFFF ];
    foreach (idx, text; inputs)
        assert(parseUniHex(text, text.length) == expected[idx]);

    // eight valid digits whose value lies above the last code point
    string tooBig = "0011FFFF";
    auto rangeErr = collectException(parseUniHex(tooBig, tooBig.length));
    assert(rangeErr.msg.canFind("invalid codepoint"));
}
6228 
// Returns `set` extended with every simple case folding of those of its
// members that are also in `unicode.LC`.
auto caseEnclose(CodepointSet set)
{
    auto casedLetters = set & unicode.LC;
    foreach (dchar letter; casedLetters.byCodepoint)
        foreach (folded; simpleCaseFoldings(letter))
            set |= folded;
    return set;
}
6239 
6240 /+
6241     fetch codepoint set corresponding to a name (InBlock or binary property)
6242 +/
6243 CodepointSet getUnicodeSet(const scope char[] name, bool negated,  bool casefold) @safe
6244 {
6245     CodepointSet s = unicode(name);
6246     //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC)
6247     if (casefold)
6248        s = caseEnclose(s);
6249     if (negated)
6250         s = s.inverted;
6251     return s;
6252 }
6253 
// Parser for the regex-style '[...]' character-class syntax over `Range`.
// `parseCharTerm` scans one run of literal characters, escapes and ranges
// together with the set operator that follows it; `parseSet` combines such
// terms using a value stack and an operator stack. When `casefold_` is set,
// characters and ranges are added together with their simple case foldings.
// Syntax errors, including input that ends too early, are reported by
// throwing through `enforce`.
struct UnicodeSetParser(Range)
{
    import std.exception : enforce;
    import std.typecons : tuple, Tuple;
    Range range;
    bool casefold_;

    @property bool empty(){ return range.empty; }
    @property dchar front(){ return range.front; }
    void popFront(){ range.popFront(); }

    //CodepointSet operations relatively in order of priority
    enum Operator:uint {
        Open = 0, Negate,  Difference, SymDifference, Intersection, Union, None
    }

    //parse unit of CodepointSet spec, most notably escape sequences and char ranges
    //also fetches next set operation
    Tuple!(CodepointSet,Operator) parseCharTerm()
    {
        import std.range : drop;
        enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';
        enum State{ Start, Char, Escape, CharDash, CharDashEscape,
            PotentialTwinSymbolOperator }
        Operator op = Operator.None;
        dchar last;
        CodepointSet set;
        State state = State.Start;

        // add a single character, plus its case foldings when casefold_ is set
        void addWithFlags(ref CodepointSet set, uint ch)
        {
            if (casefold_)
            {
                auto foldings = simpleCaseFoldings(ch);
                foreach (v; foldings)
                    set |= v;
            }
            else
                set |= ch;
        }

        // add the closed interval [lo, hi], honoring casefold_ the same way
        // for every kind of range (plain or ending in an escape)
        void addRangeWithFlags(ref CodepointSet set, uint lo, uint hi)
        {
            if (casefold_)
            {
                for (uint ch = lo; ch <= hi; ch++)
                    addWithFlags(set, ch);
            }
            else
                set.add(lo, hi + 1);
        }

        // map a doubled symbol (||, --, ~~, &&) to its set operator
        static Operator twinSymbolOperator(dchar symbol)
        {
            switch (symbol)
            {
            case '|':
                return Operator.Union;
            case '-':
                return Operator.Difference;
            case '~':
                return Operator.SymDifference;
            case '&':
                return Operator.Intersection;
            default:
                assert(false);
            }
        }

        L_CharTermLoop:
        for (;;)
        {
            // Every state reads `front`. Checking here covers the first
            // iteration, the regular advance at the bottom of the loop and the
            // `continue` paths (after \p, \x, \u, \U) that have already
            // consumed their input.
            enforce(!empty, "unexpected end of CodepointSet");
            final switch (state)
            {
            case State.Start:
                switch (front)
                {
                case '|':
                case '-':
                case '~':
                case '&':
                    state = State.PotentialTwinSymbolOperator;
                    last = front;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    break L_CharTermLoop;
                case '\\':
                    state = State.Escape;
                    break;
                default:
                    state = State.Char;
                    last = front;
                }
                break;
            case State.Char:
                // xxx last front xxx
                switch (front)
                {
                case '|':
                case '~':
                case '&':
                    // then last is treated as normal char and added as implicit union
                    state = State.PotentialTwinSymbolOperator;
                    addWithFlags(set, last);
                    last = front;
                    break;
                case '-': // still need more info
                    state = State.CharDash;
                    break;
                case '\\':
                    // a literal followed by an escape is still a literal:
                    // it must be case-folded like in every other branch
                    addWithFlags(set, last);
                    state = State.Escape;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    addWithFlags(set, last);
                    break L_CharTermLoop;
                default:
                    state = State.Char;
                    addWithFlags(set, last);
                    last = front;
                }
                break;
            case State.PotentialTwinSymbolOperator:
                // xxx last front xxxx
                // where last = [|-&~]
                if (front == last)
                {
                    op = twinSymbolOperator(last);
                    popFront();//skip second twin char
                    break L_CharTermLoop;
                }
                goto case State.Char;
            case State.Escape:
                // xxx \ front xxx
                switch (front)
                {
                case 'f':
                    last = '\f';
                    state = State.Char;
                    break;
                case 'n':
                    last = '\n';
                    state = State.Char;
                    break;
                case 'r':
                    last = '\r';
                    state = State.Char;
                    break;
                case 't':
                    last = '\t';
                    state = State.Char;
                    break;
                case 'v':
                    last = '\v';
                    state = State.Char;
                    break;
                case 'c':
                    last = unicode.parseControlCode(this);
                    state = State.Char;
                    break;
                // one case label per escapable character, all sharing the body below
                foreach (val; Escapables)
                {
                case val:
                }
                    last = front;
                    state = State.Char;
                    break;
                case 'p':
                    set.add(unicode.parsePropertySpec(this, false, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'P':
                    set.add(unicode.parsePropertySpec(this, true, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'x':
                    popFront();
                    last = parseUniHex(this, 2);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    last = parseUniHex(this, 4);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    last = parseUniHex(this, 8);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'd':
                    set.add(unicode.Nd);
                    state = State.Start;
                    break;
                case 'D':
                    set.add(unicode.Nd.inverted);
                    state = State.Start;
                    break;
                case 's':
                    set.add(unicode.White_Space);
                    state = State.Start;
                    break;
                case 'S':
                    set.add(unicode.White_Space.inverted);
                    state = State.Start;
                    break;
                case 'w':
                    set.add(wordCharacter);
                    state = State.Start;
                    break;
                case 'W':
                    set.add(wordCharacter.inverted);
                    state = State.Start;
                    break;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                break;
            case State.CharDash:
                // xxx last - front xxx
                switch (front)
                {
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    //means dash is a single char not an interval specifier
                    addWithFlags(set, last);
                    addWithFlags(set, '-');
                    break L_CharTermLoop;
                 case '-'://set Difference again
                    addWithFlags(set, last);
                    op = Operator.Difference;
                    popFront();//skip '-'
                    break L_CharTermLoop;
                case '\\':
                    state = State.CharDashEscape;
                    break;
                default:
                    enforce(last <= front, "inverted range");
                    addRangeWithFlags(set, last, front);
                    state = State.Start;
                }
                break;
            case State.CharDashEscape:
            //xxx last - \ front xxx
                uint end;
                switch (front)
                {
                case 'f':
                    end = '\f';
                    break;
                case 'n':
                    end = '\n';
                    break;
                case 'r':
                    end = '\r';
                    break;
                case 't':
                    end = '\t';
                    break;
                case 'v':
                    end = '\v';
                    break;
                // one case label per escapable character, all sharing the body below
                foreach (val; Escapables)
                {
                case val:
                }
                    end = front;
                    break;
                case 'c':
                    end = unicode.parseControlCode(this);
                    break;
                case 'x':
                    popFront();
                    end = parseUniHex(this, 2);
                    enforce(last <= end,"inverted range");
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    end = parseUniHex(this, 4);
                    enforce(last <= end,"inverted range");
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    end = parseUniHex(this, 8);
                    enforce(last <= end,"inverted range");
                    addRangeWithFlags(set, last, end);
                    state = State.Start;
                    continue L_CharTermLoop;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                // Lookahead to check if it's a \T
                // where T is sub-pattern terminator in multi-pattern scheme
                auto lookahead = range.save.drop(1);
                if (end == '\\' && !lookahead.empty)
                {
                    if (lookahead.front >= privateUseStart && lookahead.front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                }
                enforce(last <= end,"inverted range");
                addRangeWithFlags(set, last, end);
                state = State.Start;
                break;
            }
            // advance; the emptiness check happens at the top of the loop
            popFront();
        }
        return tuple(set, op);
    }

    alias ValStack = Stack!(CodepointSet);
    alias OpStack = Stack!(Operator);

    // Parses a complete (possibly nested) '[...]' expression starting at the
    // current position and returns the resulting set. The range is left just
    // past the closing ']'.
    CodepointSet parseSet()
    {
        ValStack vstack;
        OpStack opstack;
        import std.functional : unaryFun;
        enforce(!empty, "unexpected end of input");
        enforce(front == '[', "expected '[' at the start of unicode set");
        // apply `op` to the operand(s) on top of `stack`;
        // returns false for anything that is not an applicable operator
        static bool apply(Operator op, ref ValStack stack)
        {
            switch (op)
            {
            case Operator.Negate:
                enforce(!stack.empty, "no operand for '^'");
                stack.top = stack.top.inverted;
                break;
            case Operator.Union:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '||'");
                stack.top.add(s);
                break;
            case Operator.Difference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '--'");
                stack.top.sub(s);
                break;
            case Operator.SymDifference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '~~'");
                stack.top ~= s;
                break;
            case Operator.Intersection:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '&&'");
                stack.top.intersect(s);
                break;
            default:
                return false;
            }
            return true;
        }
        // pop and apply operators for as long as `cond` holds for the top one;
        // returns false on a syntax error or when the operator stack runs dry
        static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack)
        {
            while (cond(opstack.top))
            {
                if (!apply(opstack.pop(),vstack))
                    return false;//syntax error
                if (opstack.empty)
                    return false;
            }
            return true;
        }

        L_CharsetLoop:
        do
        {
            // The switch below reads `front`. Input that ends while a '[' is
            // still open (e.g. after a trailing "--") is a syntax error.
            enforce(!empty, "unexpected end of character class");
            switch (front)
            {
            case '[':
                opstack.push(Operator.Open);
                popFront();
                enforce(!empty, "unexpected end of character class");
                if (front == '^')
                {
                    opstack.push(Operator.Negate);
                    popFront();
                    enforce(!empty, "unexpected end of character class");
                }
                else if (front == ']') // []...] is special cased
                {
                    popFront();
                    enforce(!empty, "wrong character set");
                    auto pair = parseCharTerm();
                    pair[0].add(']', ']'+1);
                    if (pair[1] != Operator.None)
                    {
                        if (opstack.top == Operator.Union)
                            unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                        opstack.push(pair[1]);
                    }
                    vstack.push(pair[0]);
                }
                break;
            case ']':
                enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack),
                    "character class syntax error");
                enforce(!opstack.empty, "unmatched ']'");
                opstack.pop();
                popFront();
                if (opstack.empty)
                    break L_CharsetLoop;
                auto pair  = parseCharTerm();
                if (!pair[0].empty)//not only operator e.g. -- or ~~
                {
                    vstack.top.add(pair[0]);//apply union
                }
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                break;
            //
            default://yet another pair of term(op)?
                auto pair = parseCharTerm();
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                vstack.push(pair[0]);
            }

        }while (!empty || !opstack.empty);
        while (!opstack.empty)
            apply(opstack.pop(),vstack);
        assert(vstack.length == 1);
        return vstack.top;
    }
}
6700 
6701 /**
6702     A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of
6703     a block, script or general category.
6704 
6705     It uses well defined standard rules of property name lookup.
6706     This includes fuzzy matching of names, so that
6707     'White_Space', 'white-SpAce' and 'whitespace' are all considered equal
6708     and yield the same set of white space $(CHARACTERS).
6709 */
6710 @safe public struct unicode
6711 {
6712     import std.exception : enforce;
6713     /**
6714         Performs the lookup of set of $(CODEPOINTS)
6715         with compile-time correctness checking.
6716         This short-cut version combines 3 searches:
6717         across blocks, scripts, and common binary properties.
6718 
6719         Note that since scripts and blocks overlap the
6720         usual trick to disambiguate is used - to get a block use
6721         `unicode.InBlockName`, to search a script
6722         use `unicode.ScriptName`.
6723 
6724         See_Also: $(LREF block), $(LREF script)
6725         and (not included in this search) $(LREF hangulSyllableType).
6726     */
6727 
6728     static @property auto opDispatch(string name)() pure
6729     {
6730         static if (findAny(name))
6731             return loadAny(name);
6732         else
6733             static assert(false, "No unicode set by name "~name~" was found.");
6734     }
6735 
6736     ///
6737     @safe unittest
6738     {
6739         import std.exception : collectException;
6740         auto ascii = unicode.ASCII;
6741         assert(ascii['A']);
6742         assert(ascii['~']);
6743         assert(!ascii['\u00e0']);
6744         // matching is case-insensitive
6745         assert(ascii == unicode.ascII);
6746         assert(!ascii['à']);
6747         // underscores, '-' and whitespace in names are ignored too
6748         auto latin = unicode.in_latin1_Supplement;
6749         assert(latin['à']);
6750         assert(!latin['$']);
6751         // BTW Latin 1 Supplement is a block, hence "In" prefix
6752         assert(latin == unicode("In Latin 1 Supplement"));
6753         // run-time look up throws if no such set is found
6754         assert(collectException(unicode("InCyrilliac")));
6755     }
6756 
6757     /**
6758         The same lookup across blocks, scripts, or binary properties,
6759         but performed at run-time.
6760         This version is provided for cases where `name`
6761         is not known beforehand; otherwise compile-time
6762         checked $(LREF opDispatch) is typically a better choice.
6763 
6764         See the $(S_LINK Unicode properties, table of properties) for available
6765         sets.
6766     */
6767     static auto opCall(C)(const scope C[] name)
6768         if (is(C : dchar))
6769     {
6770         return loadAny(name);
6771     }
6772 
6773     /**
6774         Narrows down the search for sets of $(CODEPOINTS) to all Unicode blocks.
6775 
6776         Note:
6777         Here block names are unambiguous as no scripts are searched
6778         and thus to search use simply `unicode.block.BlockName` notation.
6779 
6780         See $(S_LINK Unicode properties, table of properties) for available sets.
6781         See_Also: $(S_LINK Unicode properties, table of properties).
6782     */
6783     struct block
6784     {
6785         import std.internal.unicode_tables : blocks; // generated file
6786         mixin SetSearcher!(blocks.tab, "block");
6787     }
6788 
6789     ///
6790     @safe unittest
6791     {
6792         // use .block for explicitness
6793         assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic);
6794     }
6795 
6796     /**
6797         Narrows down the search for sets of $(CODEPOINTS) to all Unicode scripts.
6798 
6799         See the $(S_LINK Unicode properties, table of properties) for available
6800         sets.
6801     */
6802     struct script
6803     {
6804         import std.internal.unicode_tables : scripts; // generated file
6805         mixin SetSearcher!(scripts.tab, "script");
6806     }
6807 
6808     ///
6809     @safe unittest
6810     {
6811         auto arabicScript = unicode.script.arabic;
6812         auto arabicBlock = unicode.block.arabic;
6813         // there is an intersection between script and block
6814         assert(arabicBlock['؁']);
6815         assert(arabicScript['؁']);
6816         // but they are different
6817         assert(arabicBlock != arabicScript);
6818         assert(arabicBlock == unicode.inArabic);
6819         assert(arabicScript == unicode.arabic);
6820     }
6821 
6822     /**
6823         Fetch a set of $(CODEPOINTS) that have the given hangul syllable type.
6824 
6825         Other non-binary properties (once supported) follow the same
6826         notation - `unicode.propertyName.propertyValue` for compile-time
6827         checked access and `unicode.propertyName(propertyValue)`
6828         for run-time checked one.
6829 
6830         See the $(S_LINK Unicode properties, table of properties) for available
6831         sets.
6832     */
6833     struct hangulSyllableType
6834     {
6835         import std.internal.unicode_tables : hangul; // generated file
6836         mixin SetSearcher!(hangul.tab, "hangul syllable type");
6837     }
6838 
6839     ///
6840     @safe unittest
6841     {
6842         // L here is syllable type not Letter as in unicode.L short-cut
6843         auto leadingVowel = unicode.hangulSyllableType("L");
6844         // check that some leading vowels are present
6845         foreach (vowel; '\u1110'..'\u115F')
6846             assert(leadingVowel[vowel]);
6847         assert(leadingVowel == unicode.hangulSyllableType.L);
6848     }
6849 
6850     //parse control code of form \cXXX, c assumed to be the current symbol
6851     static package(std) dchar parseControlCode(Parser)(ref Parser p)
6852     {
6853         with(p)
6854         {
6855             popFront();
6856             enforce(!empty, "Unfinished escape sequence");
6857             enforce(('a' <= front && front <= 'z')
6858                 || ('A' <= front && front <= 'Z'),
6859             "Only letters are allowed after \\c");
6860             return front & 0x1f;
6861         }
6862     }
6863 
6864     //parse and return a CodepointSet for \p{...Property...} and \P{...Property..},
6865     //\ - assumed to be processed, p - is current
6866     static package(std) CodepointSet parsePropertySpec(Range)(ref Range p,
6867         bool negated, bool casefold)
6868     {
6869         static import std.ascii;
6870         with(p)
6871         {
6872             enum MAX_PROPERTY = 128;
6873             char[MAX_PROPERTY] result;
6874             uint k = 0;
6875             popFront();
6876             enforce(!empty, "eof parsing unicode property spec");
6877             if (front == '{')
6878             {
6879                 popFront();
6880                 while (k < MAX_PROPERTY && !empty && front !='}'
6881                     && front !=':')
6882                 {
6883                     if (front != '-' && front != ' ' && front != '_')
6884                         result[k++] = cast(char) std.ascii.toLower(front);
6885                     popFront();
6886                 }
6887                 enforce(k != MAX_PROPERTY, "invalid property name");
6888                 enforce(front == '}', "} expected ");
6889             }
6890             else
6891             {//single char properties e.g.: \pL, \pN ...
6892                 enforce(front < 0x80, "invalid property name");
6893                 result[k++] = cast(char) front;
6894             }
6895             auto s = getUnicodeSet(result[0 .. k], negated, casefold);
6896             enforce(!s.empty, "unrecognized unicode property spec");
6897             popFront();
6898             return s;
6899         }
6900     }
6901 
6902     /**
6903         Parse unicode codepoint set from given `range` using standard regex
6904         syntax '[...]'. The range is advanced skiping over regex set definition.
6905         `casefold` parameter determines if the set should be casefolded - that is
6906         include both lower and upper case versions for any letters in the set.
6907     */
6908     static CodepointSet parseSet(Range)(ref Range range, bool casefold=false)
6909     if (isInputRange!Range && is(ElementType!Range : dchar))
6910     {
6911         auto usParser = UnicodeSetParser!Range(range, casefold);
6912         auto set = usParser.parseSet();
6913         range = usParser.range;
6914         return set;
6915     }
6916 
6917     ///
6918     @safe unittest
6919     {
6920         import std.uni : unicode;
6921         string pat = "[a-zA-Z0-9]hello";
6922         auto set = unicode.parseSet(pat);
6923         // check some of the codepoints
6924         assert(set['a'] && set['A'] && set['9']);
6925         assert(pat == "hello");
6926     }
6927 
6928 private:
    // Short local name for comparePropertyName, used by findAny and loadAny.
    alias ucmp = comparePropertyName;
6930 
6931     static bool findAny(string name)
6932     {
6933         import std.internal.unicode_tables : blocks, scripts, uniProps; // generated file
6934         return isPrettyPropertyName(name)
6935             || findSetName!(uniProps.tab)(name) || findSetName!(scripts.tab)(name)
6936             || (ucmp(name[0 .. 2],"In") == 0 && findSetName!(blocks.tab)(name[2..$]));
6937     }
6938 
6939     static auto loadAny(Set=CodepointSet, C)(const scope C[] name) pure
6940     {
6941         import std.conv : to;
6942         import std.internal.unicode_tables : blocks, scripts; // generated file
6943         Set set;
6944         immutable loaded = loadProperty(name, set) || loadUnicodeSet!(scripts.tab)(name, set)
6945             || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
6946                 && loadUnicodeSet!(blocks.tab)(name[2..$], set));
6947         if (loaded)
6948             return set;
6949         throw new Exception("No unicode set by name "~name.to!string()~" was found.");
6950     }
6951 
6952     // FIXME: re-disable once the compiler is fixed
6953     // Disabled to prevent the mistake of creating instances of this pseudo-struct.
6954     //@disable ~this();
6955 }
6956 
@safe unittest
{
    import std.internal.unicode_tables : blocks, uniProps; // generated file

    // block lookup through the "In" prefix
    auto hebrew = unicode("InHebrew");
    assert(hebrew == asSet(blocks.Hebrew));

    // "separator" is the union of Zs, Zl and Zp
    auto separators = asSet(uniProps.Zs) | asSet(uniProps.Zl) | asSet(uniProps.Zp);
    assert(unicode("separator") == separators);

    // a '-' right after the prefix is accepted as well
    auto kharoshthi = unicode("In-Kharoshthi");
    assert(kharoshthi == asSet(blocks.Kharoshthi));
}
6964 
// Sentinel equal to the largest ushort.
// NOTE(review): presumably marks "no entry" in the case-mapping tries - confirm
// against the table generator.
enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally
6966 
6967 // TODO: redo the most of hangul stuff algorithmically in case of Graphemes too
6968 // Use combined trie instead of checking for '\r' | '\n' | ccTrie,
6969 //   or extend | '\u200D' separately
6970 
// True iff `ch` lies in the inclusive range U+1F1E6 .. U+1F1FF, i.e. it is one
// of the regional indicator symbols that emoji flag sequences are built from.
private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow
{
    enum dchar first = '\U0001F1E6';
    enum dchar last = '\U0001F1FF';
    return !(ch < first || ch > last);
}
6975 
// Our grapheme decoder is a state machine, this is list of all possible
// states before each code point.
// The members double as indices into graphemeTransforms, so their order
// must stay in sync with that table.
private enum GraphemeState
{
    Start,    // nothing consumed yet (also re-entered after a Prepend)
    CR,       // a '\r' has been consumed
    RI,       // exactly one regional indicator has been consumed
    L,        // inside a Hangul sequence, last code point satisfied isHangL
    V,        // inside a Hangul sequence, last was an LV syllable or isHangV
    LVT,      // inside a Hangul sequence, last was an LVT syllable or isHangT
    Emoji,    // inside an emoji sequence, last code point was not a ZWJ
    EmojiZWJ, // inside an emoji sequence, last code point was a ZWJ
    Prepend,  // a Prepend character has been consumed
    End       // only extending characters, ZWJ or spacing marks may follow
}
6991 
// Result of a state handler: tells the decoder whether the end of the
// grapheme has been reached and what to do with the current code point.
private enum TransformRes
{
    // No, unless the source range ends here
    // (GB2 - break at end of text, unless text is empty)
    // The current code point is consumed as part of the grapheme.
    goOn,
    redo, // Run last character again with new state
    retInclude, // Yes, after the just iterated character
    retExclude // Yes, before the just iterated character
}
7002 
// The logic of the grapheme decoding is all here
// GB# means Grapheme Breaking rule number # - see Unicode standard annex #29
// Note, getting GB1 (break at start of text, unless text is empty) right
// relies on the user starting grapheme walking from beginning of the text, and
// not attempting to walk an empty text.
//
// The table is indexed by the current GraphemeState. Each handler receives
// the state by reference (so that it can switch states) together with the
// code point under consideration, and answers with a TransformRes telling
// the decoder whether the cluster continues, whether it ends, or whether the
// same code point has to be run again through the newly selected state.
private immutable TransformRes
    function(ref GraphemeState, dchar) @safe pure nothrow @nogc [] graphemeTransforms =
[
    GraphemeState.Start: (ref state, ch)
    {
        // GB4. Break after controls.
        if (graphemeControlTrie[ch] || ch == '\n')
            return TransformRes.retInclude;

        // Classify the first code point; the first matching test wins.
        with (GraphemeState) state =
            ch == '\r' ? CR :
            isRegionalIndicator(ch) ? RI :
            isHangL(ch) ? L :
            hangLV[ch] || isHangV(ch) ? V :
            hangLVT[ch] || isHangT(ch) ? LVT :
            prependTrie[ch] ? Prepend :
            xpictoTrie[ch] ? Emoji :
            End;

        // No matter what we encountered, we always include the
        // first code point in the grapheme.
        return TransformRes.goOn;
    },

    // GB3, GB4. Do not break between a CR and LF.
    // Otherwise, break after controls.
    GraphemeState.CR: (ref state, ch) => ch == '\n' ?
        TransformRes.retInclude :
        TransformRes.retExclude,

    // GB12 - GB13. Do not break within emoji flag sequences.
    // That is, do not break between regional indicator (RI) symbols if
    // there is an odd number of RI characters before the break point.
    // This state applies if one and only one RI code point has been
    // encountered.
    GraphemeState.RI: (ref state, ch)
    {
        // Either way the pair is complete (or absent) after this code point.
        state = GraphemeState.End;

        return isRegionalIndicator(ch) ?
            TransformRes.goOn :
            TransformRes.redo;
    },

    // GB6. Do not break Hangul syllable sequences.
    GraphemeState.L: (ref state, ch)
    {
        if (isHangL(ch))
            return TransformRes.goOn;
        else if (isHangV(ch) || hangLV[ch])
        {
            state = GraphemeState.V;
            return TransformRes.goOn;
        }
        else if (hangLVT[ch])
        {
            state = GraphemeState.LVT;
            return TransformRes.goOn;
        }

        // Not part of the Hangul sequence: let End decide about ch.
        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB7. Do not break Hangul syllable sequences.
    GraphemeState.V: (ref state, ch)
    {
        if (isHangV(ch))
            return TransformRes.goOn;
        else if (isHangT(ch))
        {
            state = GraphemeState.LVT;
            return TransformRes.goOn;
        }

        // Not part of the Hangul sequence: let End decide about ch.
        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB8. Do not break Hangul syllable sequences.
    GraphemeState.LVT: (ref state, ch)
    {
        if (isHangT(ch))
            return TransformRes.goOn;

        // Not part of the Hangul sequence: let End decide about ch.
        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB11. Do not break within emoji modifier sequences or emoji
    // zwj sequences. This state applies when the last code point was
    // NOT a ZWJ.
    GraphemeState.Emoji: (ref state, ch)
    {
        if (graphemeExtendTrie[ch])
            return TransformRes.goOn;

        // The ZWJ test below is only needed because ZWJ is not covered
        // by the extend trie; make sure that stays true.
        static assert(!graphemeExtendTrie['\u200D']);

        if (ch == '\u200D')
        {
            state = GraphemeState.EmojiZWJ;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        // There might still be spacing marks at the end,
        // which are not allowed in the middle of
        // emoji sequences
        return TransformRes.redo;
    },

    // GB11. Do not break within emoji modifier sequences or emoji
    // zwj sequences. This state applies when the last code point was
    // a ZWJ.
    GraphemeState.EmojiZWJ: (ref state, ch)
    {
        // Whatever follows the ZWJ is judged from the Emoji state: either
        // it is consumed right here, or it is run again through Emoji.
        state = GraphemeState.Emoji;
        if (xpictoTrie[ch])
            return TransformRes.goOn;
        return TransformRes.redo;
    },

    // GB9b. Do not break after Prepend characters.
    GraphemeState.Prepend: (ref state, ch)
    {
        // GB5. Break before controls.
        if (graphemeControlTrie[ch] || ch == '\r' || ch == '\n')
            return TransformRes.retExclude;

        // Anything else is attached to the Prepend character and is
        // classified as if it were the first code point.
        state = GraphemeState.Start;
        return TransformRes.redo;
    },

    // GB9, GB9a. Do not break before extending characters, ZWJ
    // or SpacingMarks.
    // GB999. Otherwise, break everywhere.
    GraphemeState.End: (ref state, ch)
        => !graphemeExtendTrie[ch] && !spacingMarkTrie[ch] && ch != '\u200D' ?
            TransformRes.retExclude :
            TransformRes.goOn
];
7150 
// Shared implementation behind graphemeStride, decodeGrapheme and
// Grapheme.valid: drives the graphemeTransforms state machine over `range`,
// popping every code point that belongs to the first grapheme cluster.
// With getValue == true the consumed code points are collected and returned
// as a Grapheme; with getValue == false nothing is returned and only the
// advancement of `range` is observable.
// `range` must not be empty (checked with an assert).
template genericDecodeGrapheme(bool getValue)
{
    static if (getValue)
        alias Value = Grapheme;
    else
        alias Value = void;

    Value genericDecodeGrapheme(Input)(ref Input range)
    {
        static if (getValue)
            Grapheme grapheme;
        auto state = GraphemeState.Start;
        dchar ch;

        assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);
    outer:
        while (!range.empty)
        {
            ch = range.front;

        rerun:
            // The handler may change `state` as a side effect.
            final switch (graphemeTransforms[state](state, ch))
                with(TransformRes)
            {
            case goOn:
                // ch belongs to the cluster; proceed with the next code point
                static if (getValue)
                    grapheme ~= ch;
                range.popFront();
                continue;

            case redo:
                // same code point, handler of the newly selected state
                goto rerun;

            case retInclude:
                // ch is the last code point of the cluster
                static if (getValue)
                    grapheme ~= ch;
                range.popFront();
                break outer;

            case retExclude:
                // ch starts the next cluster; leave it in the range
                break outer;
            }
        }

        static if (getValue)
            return grapheme;
    }
}
7199 
7200 public: // Public API continues
7201 
7202 /++
7203     Computes the length of grapheme cluster starting at `index`.
7204     Both the resulting length and the `index` are measured
7205     in $(S_LINK Code unit, code units).
7206 
7207     Params:
7208         C = type that is implicitly convertible to `dchars`
7209         input = array of grapheme clusters
7210         index = starting index into `input[]`
7211 
7212     Returns:
7213         length of grapheme cluster
7214 +/
7215 size_t graphemeStride(C)(const scope C[] input, size_t index) @safe pure
7216 if (is(C : dchar))
7217 {
7218     auto src = input[index..$];
7219     auto n = src.length;
7220     genericDecodeGrapheme!(false)(src);
7221     return n - src.length;
7222 }
7223 
7224 ///
@safe unittest
{
    assert(graphemeStride("  ", 1) == 1);
    // A + combining ring above
    string name = "A\u030Arhus";
    immutable stride = graphemeStride(name, 0);
    assert(stride == 3); // 'A' plus the 2 UTF-8 code units of \u030A
    auto head = name[0 .. stride];
    auto tail = name[stride .. $];
    assert(head == "A\u030A");
    assert(tail == "rhus");
}
7235 
@safe unittest
{
    // graphemeStride has to be evaluable at compile time (CTFE).
    static assert(graphemeStride("A", 0) == 1);

    // 'A' is a single UTF-8 code unit, \u0301 adds 2 more
    static assert(graphemeStride("A\u0301", 0) == 3);
}
7245 
@safe pure nothrow @nogc unittest
{
    // stride of the first cluster of `text`, in code points
    static size_t lead(dstring text) @safe pure nothrow @nogc
    {
        return graphemeStride(text, 0);
    }

    // grinning face ~ emoji modifier fitzpatrick type-5 ~ grinning face
    assert(lead("\U0001F600\U0001f3FE\U0001F600"d) == 2);
    // skier ~ female sign ~ '€'
    assert(lead("\u26F7\u2640€"d) == 1);
    // skier ~ emoji modifier fitzpatrick type-5 ~ female sign ~ '€'
    assert(lead("\u26F7\U0001f3FE\u2640€"d) == 2);
    // skier ~ zero-width joiner ~ female sign ~ '€'
    assert(lead("\u26F7\u200D\u2640€"d) == 3);
    // skier ~ emoji modifier fitzpatrick type-5 ~ zero-width joiner
    // ~ female sign ~ '€'
    assert(lead("\u26F7\U0001f3FE\u200D\u2640€"d) == 4);
    // skier ~ zero-width joiner ~ '€'
    assert(lead("\u26F7\u200D€"d) == 2);
    //'€' ~ zero-width joiner ~ skier
    assert(lead("€\u200D\u26F7"d) == 2);
    // Kaithi number sign ~ Devanagari digit four ~ Devanagari digit two
    assert(lead("\U000110BD\u096A\u0968"d) == 2);
    // Kaithi number sign ~ null
    assert(lead("\U000110BD\0"d) == 1);
}
7268 
7269 /++
7270     Reads one full grapheme cluster from an
7271     $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of dchar `inp`.
7272 
7273     For examples see the $(LREF Grapheme) below.
7274 
7275     Note:
7276     This function modifies `inp` and thus `inp`
7277     must be an L-value.
7278 +/
7279 Grapheme decodeGrapheme(Input)(ref Input inp)
7280 if (isInputRange!Input && is(immutable ElementType!Input == immutable dchar))
7281 {
7282     return genericDecodeGrapheme!true(inp);
7283 }
7284 
@safe unittest
{
    import std.algorithm.comparison : equal;

    string s = " \u0020\u0308 ";
    // a lone space first, then a space carrying a combining mark
    Grapheme gr = decodeGrapheme(s);
    assert(gr.length == 1);
    assert(gr[0] == ' ');
    gr = decodeGrapheme(s);
    assert(gr.length == 2);
    assert(equal(gr[0 .. 2], " \u0308"));

    // combining marks at the start of the text form their own cluster
    s = "\u0300\u0308\u1100";
    auto marks = decodeGrapheme(s);
    assert(equal(marks[], "\u0300\u0308"));
    auto afterMarks = decodeGrapheme(s);
    assert(equal(afterMarks[], "\u1100"));

    s = "\u11A8\u0308\uAC01";
    auto marked = decodeGrapheme(s);
    assert(equal(marked[], "\u11A8\u0308"));
    auto syllable = decodeGrapheme(s);
    assert(equal(syllable[], "\uAC01"));

    // Two Union Jacks of the Great Britain
    s = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";
    auto flag = decodeGrapheme(s);
    assert(equal(flag[], "\U0001F1EC\U0001F1E7"));
}
7306 
7307 /++
7308     $(P Iterate a string by $(LREF Grapheme).)
7309 
7310     $(P Useful for doing string manipulation that needs to be aware
7311     of graphemes.)
7312 
7313     See_Also:
7314         $(LREF byCodePoint)
7315 +/
7316 auto byGrapheme(Range)(Range range)
7317 if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
7318 {
7319     // TODO: Bidirectional access
7320     static struct Result(R)
7321     {
7322         private R _range;
7323         private Grapheme _front;
7324 
7325         bool empty() @property
7326         {
7327             return _front.length == 0;
7328         }
7329 
7330         Grapheme front() @property
7331         {
7332             return _front;
7333         }
7334 
7335         void popFront()
7336         {
7337             _front = _range.empty ? Grapheme.init : _range.decodeGrapheme();
7338         }
7339 
7340         static if (isForwardRange!R)
7341         {
7342             Result save() @property
7343             {
7344                 return Result(_range.save, _front);
7345             }
7346         }
7347     }
7348 
7349     auto result = Result!(Range)(range);
7350     result.popFront();
7351     return result;
7352 }
7353 
7354 ///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : take, drop;
    auto word = "noe\u0308l"; // noël using e + combining diaeresis
    assert(word.walkLength == 5); // 5 code points

    auto clusters = word.byGrapheme;
    assert(clusters.walkLength == 4); // 4 graphemes

    // the first three clusters are "n", "o" and "e\u0308", the last one is "l"
    auto head = clusters.take(3);
    auto tail = clusters.drop(3);
    assert(head.equal("noe\u0308".byGrapheme));
    assert(tail.equal("l".byGrapheme));
}
7369 
// Test helper: exposes a string as a plain input range of dchar, without
// `save`, so that the non-forward-range code paths can be exercised.
version (StdUnittest)
private static @safe struct InputRangeString
{
    private string s;

    @property bool empty()
    {
        return s.empty;
    }

    @property dchar front()
    {
        return s.front;
    }

    void popFront()
    {
        s.popFront();
    }
}
7380 
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.array : array;
    import std.range : retro;
    import std.range.primitives : walkLength;
    assert("".byGrapheme.walkLength == 0);

    auto reversed = "le\u0308on";
    assert(reversed.walkLength == 5);

    auto reversedClusters = reversed.byGrapheme;
    assert(reversedClusters.walkLength == 4);

    // the same text in all three UTF encodings
    static foreach (sample; AliasSeq!("noe\u0308l"c, "noe\u0308l"w, "noe\u0308l"d))
    {{
        assert(sample.walkLength == 5);
        static assert(isForwardRange!(typeof(sample)));

        auto clusters = sample.byGrapheme;
        static assert(isForwardRange!(typeof(clusters)));
        assert(clusters.walkLength == 4);
        // reversing cluster by cluster keeps the mark on its base
        assert(clusters.array.retro.equal(reversedClusters));
    }}

    // a source without `save` yields a result without `save`
    auto inputOnly = InputRangeString("noe\u0308l").byGrapheme;
    static assert(!isForwardRange!(typeof(inputOnly)));
    assert(inputOnly.walkLength == 4);
}
7410 
// Issue 23474
@safe pure unittest
{
    import std.range.primitives : walkLength;
    // the code point following the '\r' must start a cluster of its own
    auto clusters = byGrapheme("\r\u0308");
    assert(clusters.walkLength == 2);
}
7417 
7418 /++
7419     $(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.)
7420 
7421     $(P Useful for converting the result to a string after doing operations
7422     on graphemes.)
7423 
7424     $(P If passed in a range of code points, returns a range with equivalent capabilities.)
7425 +/
7426 auto byCodePoint(Range)(Range range)
7427 if (isInputRange!Range && is(immutable ElementType!Range == immutable Grapheme))
7428 {
7429     // TODO: Propagate bidirectional access
7430     static struct Result
7431     {
7432         private Range _range;
7433         private size_t i = 0;
7434 
7435         bool empty() @property
7436         {
7437             return _range.empty;
7438         }
7439 
7440         dchar front() @property
7441         {
7442             return _range.front[i];
7443         }
7444 
7445         void popFront()
7446         {
7447             ++i;
7448 
7449             if (i >= _range.front.length)
7450             {
7451                 _range.popFront();
7452                 i = 0;
7453             }
7454         }
7455 
7456         static if (isForwardRange!Range)
7457         {
7458             Result save() @property
7459             {
7460                 return Result(_range.save, i);
7461             }
7462         }
7463     }
7464 
7465     return Result(range);
7466 }
7467 
/// Ditto
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    import std.range.primitives : isBidirectionalRange, popBack;
    import std.traits : isNarrowString;
    static if (!isNarrowString!Range)
    {
        // already a range of code points: hand it back untouched
        return range;
    }
    else
    {
        // Wrap narrow strings so that only the decoding range primitives
        // are exposed; `length` and indexing are deliberately left out.
        static struct Result
        {
            private Range _range;

            @property bool empty()
            {
                return _range.empty;
            }

            @property dchar front()
            {
                return _range.front;
            }

            @property dchar back()
            {
                return _range.back;
            }

            void popFront()
            {
                _range.popFront;
            }

            void popBack()
            {
                _range.popBack;
            }

            @property auto save()
            {
                return Result(_range.save);
            }
        }
        static assert(isBidirectionalRange!(Result));
        return Result(range);
    }
}
7492 
7493 ///
@safe unittest
{
    import std.array : array;
    import std.conv : text;
    import std.range : retro;

    string original = "noe\u0308l"; // noël

    // split into graphemes, reverse their order, then flatten
    // back to code points and build a string from them
    auto clusters = original.byGrapheme.array;
    string reversed = clusters.retro.byCodePoint.text;

    assert(reversed == "le\u0308on"); // lëon
}
7511 
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : retro;
    assert("".byGrapheme.byCodePoint.equal(""));

    string sample = "noe\u0308l";
    // narrow strings must not expose a (code unit) length
    static assert(!__traits(compiles, "noe\u0308l".byCodePoint.length));

    // input-only source: neither stage may offer `save`
    auto clusters = InputRangeString(sample).byGrapheme;
    static assert(!isForwardRange!(typeof(clusters)));

    auto codePoints = clusters.byCodePoint;
    static assert(!isForwardRange!(typeof(codePoints)));

    assert(codePoints.walkLength == sample.walkLength);

    // applied directly to a string: forward and backward iteration
    auto direct = sample.byCodePoint;
    static assert(isForwardRange!(typeof(direct)));
    assert(equal(direct, sample));
    assert(equal(retro(direct.save), retro(sample.save)));
    // Check that we still have length for dstring
    assert("абвгд"d.byCodePoint.length == 5);
}
7537 
7538 /++
7539     $(P A structure designed to effectively pack $(CHARACTERS)
7540     of a $(CLUSTER).
7541     )
7542 
7543     $(P `Grapheme` has value semantics so 2 copies of a `Grapheme`
7544     always refer to distinct objects. In most actual scenarios a `Grapheme`
7545     fits on the stack and avoids memory allocation overhead for all but quite
7546     long clusters.
7547     )
7548 
7549     See_Also: $(LREF decodeGrapheme), $(LREF graphemeStride)
7550 +/
7551 @safe struct Grapheme
7552 {
7553     import std.exception : enforce;
7554     import std.traits : isDynamicArray;
7555 
7556 public:
7557     /// Ctor
7558     this(C)(const scope C[] chars...)
7559         if (is(C : dchar))
7560     {
7561         this ~= chars;
7562     }
7563 
7564     ///ditto
7565     this(Input)(Input seq)
7566         if (!isDynamicArray!Input
7567             && isInputRange!Input && is(ElementType!Input : dchar))
7568     {
7569         this ~= seq;
7570     }
7571 
7572     /// Gets a $(CODEPOINT) at the given index in this cluster.
7573     dchar opIndex(size_t index) const @nogc nothrow pure @trusted
7574     {
7575         assert(index < length);
7576         return read24(isBig ? ptr_ : small_.ptr, index);
7577     }
7578 
7579     /++
7580         Writes a $(CODEPOINT) `ch` at given index in this cluster.
7581 
7582         Warning:
7583         Use of this facility may invalidate grapheme cluster,
7584         see also $(LREF Grapheme.valid).
7585     +/
7586     void opIndexAssign(dchar ch, size_t index) @nogc nothrow pure @trusted
7587     {
7588         assert(index < length);
7589         write24(isBig ? ptr_ : small_.ptr, ch, index);
7590     }
7591 
7592     ///
7593     @safe unittest
7594     {
7595         auto g = Grapheme("A\u0302");
7596         assert(g[0] == 'A');
7597         assert(g.valid);
7598         g[1] = '~'; // ASCII tilda is not a combining mark
7599         assert(g[1] == '~');
7600         assert(!g.valid);
7601     }
7602 
7603     /++
7604         Random-access range over Grapheme's $(CHARACTERS).
7605 
7606         Warning: Invalidates when this Grapheme leaves the scope,
7607         attempts to use it then would lead to memory corruption.
7608     +/
7609     SliceOverIndexed!Grapheme opSlice(size_t a, size_t b) @nogc nothrow pure return
7610     {
7611         return sliceOverIndexed(a, b, &this);
7612     }
7613 
7614     /// ditto
7615     SliceOverIndexed!Grapheme opSlice() @nogc nothrow pure return
7616     {
7617         return sliceOverIndexed(0, length, &this);
7618     }
7619 
7620     /// Grapheme cluster length in $(CODEPOINTS).
7621     @property size_t length() const @nogc nothrow pure
7622     {
7623         return isBig ? len_ : slen_ & 0x7F;
7624     }
7625 
7626     /++
7627         Append $(CHARACTER) `ch` to this grapheme.
7628         Warning:
7629         Use of this facility may invalidate grapheme cluster,
7630         see also `valid`.
7631 
7632         See_Also: $(LREF Grapheme.valid)
7633     +/
7634     ref opOpAssign(string op)(dchar ch) @trusted
7635     {
7636         static if (op == "~")
7637         {
7638             import std.internal.memory : enforceRealloc;
7639             if (!isBig)
7640             {
7641                 if (slen_ == small_cap)
7642                     convertToBig();// & fallthrough to "big" branch
7643                 else
7644                 {
7645                     write24(small_.ptr, ch, smallLength);
7646                     slen_++;
7647                     return this;
7648                 }
7649             }
7650 
7651             assert(isBig);
7652             if (len_ == cap_)
7653             {
7654                 import core.checkedint : addu, mulu;
7655                 bool overflow;
7656                 cap_ = addu(cap_, grow, overflow);
7657                 auto nelems = mulu(3, addu(cap_, 1, overflow), overflow);
7658                 if (overflow) assert(0);
7659                 ptr_ = cast(ubyte*) enforceRealloc(ptr_, nelems);
7660             }
7661             write24(ptr_, ch, len_++);
7662             return this;
7663         }
7664         else
7665             static assert(false, "No operation "~op~" defined for Grapheme");
7666     }
7667 
7668     ///
7669     @safe unittest
7670     {
7671         import std.algorithm.comparison : equal;
7672         auto g = Grapheme("A");
7673         assert(g.valid);
7674         g ~= '\u0301';
7675         assert(g[].equal("A\u0301"));
7676         assert(g.valid);
7677         g ~= "B";
7678         // not a valid grapheme cluster anymore
7679         assert(!g.valid);
7680         // still could be useful though
7681         assert(g[].equal("A\u0301B"));
7682     }
7683 
7684     /// Append all $(CHARACTERS) from the input range `inp` to this Grapheme.
7685     ref opOpAssign(string op, Input)(scope Input inp)
7686         if (isInputRange!Input && is(ElementType!Input : dchar))
7687     {
7688         static if (op == "~")
7689         {
7690             foreach (dchar ch; inp)
7691                 this ~= ch;
7692             return this;
7693         }
7694         else
7695             static assert(false, "No operation "~op~" defined for Grapheme");
7696     }
7697 
7698     // This is not a good `opEquals`, but formerly the automatically generated
7699     // opEquals was used, which was inferred `@safe` because of bugzilla 20655:
7700     // https://issues.dlang.org/show_bug.cgi?id=20655
7701     // This `@trusted opEquals` is only here to prevent breakage.
7702     bool opEquals(R)(const auto ref R other) const @trusted
7703     {
7704         return this.tupleof == other.tupleof;
7705     }
7706 
7707     // Define a default toHash to allow AA usage
7708     size_t toHash() const @trusted
7709     {
7710         return hashOf(slen_, hashOf(small_));
7711     }
7712 
7713     /++
7714         True if this object contains valid extended grapheme cluster.
7715         Decoding primitives of this module always return a valid `Grapheme`.
7716 
7717         Appending to and direct manipulation of grapheme's $(CHARACTERS) may
7718         render it no longer valid. Certain applications may chose to use
7719         Grapheme as a "small string" of any $(CODEPOINTS) and ignore this property
7720         entirely.
7721     +/
7722     @property bool valid()() /*const*/
7723     {
7724         auto r = this[];
7725         genericDecodeGrapheme!false(r);
7726         return r.length == 0;
7727     }
7728 
7729     this(this) @nogc nothrow pure @trusted
7730     {
7731         import std.internal.memory : enforceMalloc;
7732         if (isBig)
7733         {// dup it
7734             import core.checkedint : addu, mulu;
7735             bool overflow;
7736             auto raw_cap = mulu(3, addu(cap_, 1, overflow), overflow);
7737             if (overflow) assert(0);
7738 
7739             auto p = cast(ubyte*) enforceMalloc(raw_cap);
7740             p[0 .. raw_cap] = ptr_[0 .. raw_cap];
7741             ptr_ = p;
7742         }
7743     }
7744 
7745     ~this() @nogc nothrow pure @trusted
7746     {
7747         import core.memory : pureFree;
7748         if (isBig)
7749         {
7750             pureFree(ptr_);
7751         }
7752     }
7753 
7754 
7755 private:
7756     enum small_bytes = ((ubyte*).sizeof+3*size_t.sizeof-1);
7757     // "out of the blue" grow rate, needs testing
7758     // (though graphemes are typically small < 9)
7759     enum grow = 20;
7760     enum small_cap = small_bytes/3;
7761     enum small_flag = 0x80, small_mask = 0x7F;
7762     // 16 bytes in 32bits, should be enough for the majority of cases
7763     union
7764     {
7765         struct
7766         {
7767             ubyte* ptr_;
7768             size_t cap_;
7769             size_t len_;
7770             size_t padding_;
7771         }
7772         struct
7773         {
7774             ubyte[small_bytes] small_;
7775             ubyte slen_;
7776         }
7777     }
7778 
7779     void convertToBig() @nogc nothrow pure @trusted
7780     {
7781         import std.internal.memory : enforceMalloc;
7782         static assert(grow.max / 3 - 1 >= grow);
7783         enum nbytes = 3 * (grow + 1);
7784         size_t k = smallLength;
7785         ubyte* p = cast(ubyte*) enforceMalloc(nbytes);
7786         for (int i=0; i<k; i++)
7787             write24(p, read24(small_.ptr, i), i);
7788         // now we can overwrite small array data
7789         ptr_ = p;
7790         len_ = slen_;
7791         assert(grow > len_);
7792         cap_ = grow;
7793         setBig();
7794     }
7795 
7796     void setBig() @nogc nothrow pure { slen_ |= small_flag; }
7797 
7798     @property size_t smallLength() const @nogc nothrow pure
7799     {
7800         return slen_ & small_mask;
7801     }
7802     @property ubyte isBig() const @nogc nothrow pure
7803     {
7804         return slen_ & small_flag;
7805     }
7806 }
7807 
// Layout guard: Grapheme must occupy exactly four machine words.
static assert(Grapheme.sizeof == size_t.sizeof*4);
7809 
7810 
@safe pure /*nothrow @nogc*/ unittest // TODO: string .front is GC and throw
{
    import std.algorithm.comparison : equal;

    // three single-code-point clusters, one per Cyrillic letter
    Grapheme[3] expected = [Grapheme("Ю"), Grapheme("У"), Grapheme("З")];
    auto clusters = byGrapheme("ЮУЗ");
    assert(equal(clusters, expected[]));
}
7817 
7818 ///
7819 @safe unittest
7820 {
7821     import std.algorithm.comparison : equal;
7822     import std.algorithm.iteration : filter;
7823     import std.range : isRandomAccessRange;
7824 
7825     string bold = "ku\u0308hn";
7826 
7827     // note that decodeGrapheme takes parameter by ref
7828     auto first = decodeGrapheme(bold);
7829 
7830     assert(first.length == 1);
7831     assert(first[0] == 'k');
7832 
7833     // the next grapheme is 2 characters long
7834     auto wideOne = decodeGrapheme(bold);
7835     // slicing a grapheme yields a random-access range of dchar
7836     assert(wideOne[].equal("u\u0308"));
7837     assert(wideOne.length == 2);
7838     static assert(isRandomAccessRange!(typeof(wideOne[])));
7839 
7840     // all of the usual range manipulation is possible
7841     assert(wideOne[].filter!isMark().equal("\u0308"));
7842 
7843     auto g = Grapheme("A");
7844     assert(g.valid);
7845     g ~= '\u0301';
7846     assert(g[].equal("A\u0301"));
7847     assert(g.valid);
7848     g ~= "B";
7849     // not a valid grapheme cluster anymore
7850     assert(!g.valid);
7851     // still could be useful though
7852     assert(g[].equal("A\u0301B"));
7853 }
7854 
// Index assignment: overwriting the combining mark with a non-mark
// invalidates the cluster.
@safe unittest
{
    auto g = Grapheme("A\u0302");
    assert(g[0] == 'A');
    assert(g.valid);
    g[1] = '~'; // ASCII tilde is not a combining mark
    assert(g[1] == '~');
    assert(!g.valid);
}
7864 
// Exercises indexing, appending, copying and slicing of Grapheme with
// sequences of various lengths.
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : map;
    import std.conv : text;
    import std.range : iota;

    // not valid clusters (but it is just a test)
    auto g  = Grapheme('a', 'b', 'c', 'd', 'e');
    assert(g[0] == 'a');
    assert(g[1] == 'b');
    assert(g[2] == 'c');
    assert(g[3] == 'd');
    assert(g[4] == 'e');
    // overwriting one slot must leave its neighbours untouched
    g[3] = 'Й';
    assert(g[2] == 'c');
    assert(g[3] == 'Й', text(g[3], " vs ", 'Й'));
    assert(g[4] == 'e');
    assert(!g.valid);

    // appending must preserve everything stored so far
    g ~= 'ц';
    g ~= '~';
    assert(g[0] == 'a');
    assert(g[1] == 'b');
    assert(g[2] == 'c');
    assert(g[3] == 'Й');
    assert(g[4] == 'e');
    assert(g[5] == 'ц');
    assert(g[6] == '~');
    assert(!g.valid);

    // a copy is independent: writes to it do not show up in the original
    Grapheme copy = g;
    copy[0] = 'X';
    copy[1] = '-';
    assert(g[0] == 'a' && copy[0] == 'X');
    assert(g[1] == 'b' && copy[1] == '-');
    assert(equal(g[2 .. g.length], copy[2 .. copy.length]));
    // re-assign with a 13-character sequence, then append to it
    copy = Grapheme("АБВГДЕЁЖЗИКЛМ");
    assert(equal(copy[0 .. 8], "АБВГДЕЁЖ"), text(copy[0 .. 8]));
    copy ~= "xyz";
    assert(equal(copy[13 .. 15], "xy"), text(copy[13 .. 15]));
    assert(!copy.valid);

    // build 'A' .. 'Z' one append at a time, starting from an empty Grapheme
    Grapheme h;
    foreach (dchar v; iota(cast(int)'A', cast(int)'Z'+1).map!"cast(dchar)a"())
        h ~= v;
    assert(equal(h[], iota(cast(int)'A', cast(int)'Z'+1)));
}
7913 
// Ensure Grapheme can be used as an associative array key
// (compile-only check: declaring the AA type is the whole test).
@safe unittest
{
    int[Grapheme] aa;
}
7919 
7920 /++
7921     $(P Does basic case-insensitive comparison of `r1` and `r2`.
7922     This function uses simpler comparison rule thus achieving better performance
7923     than $(LREF icmp). However keep in mind the warning below.)
7924 
7925     Params:
7926         r1 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters
7927         r2 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters
7928 
7929     Returns:
7930         An `int` that is 0 if the strings match,
7931         &lt;0 if `r1` is lexicographically "less" than `r2`,
7932         &gt;0 if `r1` is lexicographically "greater" than `r2`
7933 
7934     Warning:
7935     This function only handles 1:1 $(CODEPOINT) mapping
7936     and thus is not sufficient for certain alphabets
7937     like German, Greek and few others.
7938 
7939     See_Also:
7940         $(LREF icmp)
7941         $(REF cmp, std,algorithm,comparison)
7942 +/
7943 int sicmp(S1, S2)(scope S1 r1, scope S2 r2)
7944 if (isInputRange!S1 && isSomeChar!(ElementEncodingType!S1)
7945     && isInputRange!S2 && isSomeChar!(ElementEncodingType!S2))
7946 {
7947     import std.internal.unicode_tables : sTable = simpleCaseTable; // generated file
7948     import std.range.primitives : isInfinite;
7949     import std.utf : decodeFront;
7950     import std.traits : isDynamicArray;
7951     import std.typecons : Yes;
7952     static import std.ascii;
7953 
7954     static if ((isDynamicArray!S1 || isRandomAccessRange!S1)
7955         && (isDynamicArray!S2 || isRandomAccessRange!S2)
7956         && !(isInfinite!S1 && isInfinite!S2)
7957         && __traits(compiles,
7958             {
7959                 size_t s = size_t.sizeof / 2;
7960                 r1 = r1[s .. $];
7961                 r2 = r2[s .. $];
7962             }))
7963     {{
7964         // ASCII optimization for dynamic arrays & similar.
7965         size_t i = 0;
7966         static if (isInfinite!S1)
7967             immutable end = r2.length;
7968         else static if (isInfinite!S2)
7969             immutable end = r1.length;
7970         else
7971             immutable end = r1.length > r2.length ? r2.length : r1.length;
7972         for (; i < end; ++i)
7973         {
7974             auto lhs = r1[i];
7975             auto rhs = r2[i];
7976             if ((lhs | rhs) >= 0x80) goto NonAsciiPath;
7977             if (lhs == rhs) continue;
7978             auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
7979             if (lowDiff) return lowDiff;
7980         }
7981         static if (isInfinite!S1)
7982             return 1;
7983         else static if (isInfinite!S2)
7984             return -1;
7985         else
7986             return (r1.length > r2.length) - (r2.length > r1.length);
7987 
7988     NonAsciiPath:
7989         r1 = r1[i .. $];
7990         r2 = r2[i .. $];
7991         // Fall through to standard case.
7992     }}
7993 
7994     while (!r1.empty)
7995     {
7996         immutable lhs = decodeFront!(Yes.useReplacementDchar)(r1);
7997         if (r2.empty)
7998             return 1;
7999         immutable rhs = decodeFront!(Yes.useReplacementDchar)(r2);
8000         int diff = lhs - rhs;
8001         if (!diff)
8002             continue;
8003         if ((lhs | rhs) < 0x80)
8004         {
8005             immutable d = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
8006             if (!d) continue;
8007             return d;
8008         }
8009         size_t idx = simpleCaseTrie[lhs];
8010         size_t idx2 = simpleCaseTrie[rhs];
8011         // simpleCaseTrie is packed index table
8012         if (idx != EMPTY_CASE_TRIE)
8013         {
8014             if (idx2 != EMPTY_CASE_TRIE)
8015             {// both cased chars
8016                 // adjust idx --> start of bucket
8017                 idx = idx - sTable(idx).n;
8018                 idx2 = idx2 - sTable(idx2).n;
8019                 if (idx == idx2)// one bucket, equivalent chars
8020                     continue;
8021                 else//  not the same bucket
8022                     diff = sTable(idx).ch - sTable(idx2).ch;
8023             }
8024             else
8025                 diff = sTable(idx - sTable(idx).n).ch - rhs;
8026         }
8027         else if (idx2 != EMPTY_CASE_TRIE)
8028         {
8029             diff = lhs - sTable(idx2 - sTable(idx2).n).ch;
8030         }
8031         // one of chars is not cased at all
8032         return diff;
8033     }
8034     return int(r2.empty) - 1;
8035 }
8036 
8037 ///
8038 @safe @nogc pure nothrow unittest
8039 {
8040     assert(sicmp("Август", "авгусТ") == 0);
8041     // Greek also works as long as there is no 1:M mapping in sight
8042     assert(sicmp("ΌΎ", "όύ") == 0);
8043     // things like the following won't get matched as equal
8044     // Greek small letter iota with dialytika and tonos
8045     assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);
8046 
8047     // while icmp has no problem with that
8048     assert(icmp("ΐ", "\u03B9\u0308\u0301") == 0);
8049     assert(icmp("ΌΎ", "όύ") == 0);
8050 }
8051 
// Non-template overloads for the most common cases (char, wchar and dchar
// slices) to reduce compile time; each one forwards to the template sicmp.
@safe @nogc pure nothrow
{
    int sicmp(scope const(char)[] str1, scope const(char)[] str2)
    {
        alias Str = const(char)[];
        return sicmp!(Str, Str)(str1, str2);
    }

    int sicmp(scope const(wchar)[] str1, scope const(wchar)[] str2)
    {
        alias Str = const(wchar)[];
        return sicmp!(Str, Str)(str1, str2);
    }

    int sicmp(scope const(dchar)[] str1, scope const(dchar)[] str2)
    {
        alias Str = const(dchar)[];
        return sicmp!(Str, Str)(str1, str2);
    }
}
8064 
// Helper for icmp: tries to match `lhs` against `rhs` plus, for multi-code-point
// entries, the front of `rtail`.
// Returns 0 on a match (consuming the matched part of `rtail` for long entries).
// Otherwise returns `lhs` itself when it has no entry in fullCaseTrie, or the
// first code point of the first entry of its bucket.
private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail)
{
    import std.algorithm.searching : skipOver;
    import std.internal.unicode_tables : fullCaseTable; // generated file
    alias fTable = fullCaseTable;

    // fullCaseTrie is packed index table
    immutable size_t hit = fullCaseTrie[lhs];
    if (hit == EMPTY_CASE_TRIE)
        return lhs;

    // [first, last) delimits the bucket that `lhs` belongs to
    immutable first = hit - fTable(hit).n;
    immutable last = fTable(hit).size + first;
    assert(fTable(first).entry_len == 1);

    foreach (size_t cur; first .. last)
    {
        const entryLen = fTable(cur).entry_len;
        if (entryLen != 1)
        {// OK it's a long chunk, like 'ss' for German
            dchar[3] buf = fTable(cur).seq;
            const dchar[] seq = buf[0 .. entryLen];
            // note that this path modifies rtail
            // iff the whole sequence matched
            if (rhs == seq[0] && rtail.skipOver(seq[1..$]))
                return 0;
        }
        else if (fTable(cur).seq[0] == rhs)
        {
            return 0;
        }
    }
    return fTable(first).seq[0]; // new remapped character for accurate diffs
}
8102 
8103 /++
8104     Does case insensitive comparison of `r1` and `r2`.
8105     Follows the rules of full case-folding mapping.
8106     This includes matching as equal german ß with "ss" and
8107     other 1:M $(CODEPOINT) mappings unlike $(LREF sicmp).
8108     The cost of `icmp` being pedantically correct is
8109     slightly worse performance.
8110 
8111     Params:
8112         r1 = a forward range of characters
8113         r2 = a forward range of characters
8114 
8115     Returns:
8116         An `int` that is 0 if the strings match,
8117         &lt;0 if `str1` is lexicographically "less" than `str2`,
8118         &gt;0 if `str1` is lexicographically "greater" than `str2`
8119 
8120     See_Also:
8121         $(LREF sicmp)
8122         $(REF cmp, std,algorithm,comparison)
8123 +/
8124 int icmp(S1, S2)(S1 r1, S2 r2)
8125 if (isForwardRange!S1 && isSomeChar!(ElementEncodingType!S1)
8126     && isForwardRange!S2 && isSomeChar!(ElementEncodingType!S2))
8127 {
8128     import std.range.primitives : isInfinite;
8129     import std.traits : isDynamicArray;
8130     import std.utf : byDchar;
8131     static import std.ascii;
8132 
8133     static if ((isDynamicArray!S1 || isRandomAccessRange!S1)
8134         && (isDynamicArray!S2 || isRandomAccessRange!S2)
8135         && !(isInfinite!S1 && isInfinite!S2)
8136         && __traits(compiles,
8137             {
8138                 size_t s = size_t.max / 2;
8139                 r1 = r1[s .. $];
8140                 r2 = r2[s .. $];
8141             }))
8142     {{
8143         // ASCII optimization for dynamic arrays & similar.
8144         size_t i = 0;
8145         static if (isInfinite!S1)
8146             immutable end = r2.length;
8147         else static if (isInfinite!S2)
8148             immutable end = r1.length;
8149         else
8150             immutable end = r1.length > r2.length ? r2.length : r1.length;
8151         for (; i < end; ++i)
8152         {
8153             auto lhs = r1[i];
8154             auto rhs = r2[i];
8155             if ((lhs | rhs) >= 0x80) goto NonAsciiPath;
8156             if (lhs == rhs) continue;
8157             auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
8158             if (lowDiff) return lowDiff;
8159         }
8160         static if (isInfinite!S1)
8161             return 1;
8162         else static if (isInfinite!S2)
8163             return -1;
8164         else
8165             return (r1.length > r2.length) - (r2.length > r1.length);
8166 
8167     NonAsciiPath:
8168         r1 = r1[i .. $];
8169         r2 = r2[i .. $];
8170         // Fall through to standard case.
8171     }}
8172 
8173     auto str1 = r1.byDchar;
8174     auto str2 = r2.byDchar;
8175 
8176     for (;;)
8177     {
8178         if (str1.empty)
8179             return str2.empty ? 0 : -1;
8180         immutable lhs = str1.front;
8181         if (str2.empty)
8182             return 1;
8183         immutable rhs = str2.front;
8184         str1.popFront();
8185         str2.popFront();
8186         if (!(lhs - rhs))
8187             continue;
8188         // first try to match lhs to <rhs,right-tail> sequence
8189         immutable cmpLR = fullCasedCmp(lhs, rhs, str2);
8190         if (!cmpLR)
8191             continue;
8192         // then rhs to <lhs,left-tail> sequence
8193         immutable cmpRL = fullCasedCmp(rhs, lhs, str1);
8194         if (!cmpRL)
8195             continue;
8196         // cmpXX contain remapped codepoints
8197         // to obtain stable ordering of icmp
8198         return cmpLR - cmpRL;
8199     }
8200 }
8201 
8202 ///
8203 @safe @nogc pure nothrow unittest
8204 {
8205     assert(icmp("Rußland", "Russland") == 0);
8206     assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
8207 }
8208 
8209 /**
8210  * By using $(REF byUTF, std,utf) and its aliases, GC allocations via auto-decoding
8211  * and thrown exceptions can be avoided, making `icmp` `@safe @nogc nothrow pure`.
8212  */
8213 @safe @nogc nothrow pure unittest
8214 {
8215     import std.utf : byDchar;
8216 
8217     assert(icmp("Rußland".byDchar, "Russland".byDchar) == 0);
8218     assert(icmp("ᾩ -> \u1F70\u03B9".byDchar, "\u1F61\u03B9 -> ᾲ".byDchar) == 0);
8219 }
8220 
// Test different character types: every mix of char, wchar and dchar
// strings below must compare equal.
@safe unittest
{
    assert(icmp("Rußland", "Russland") == 0);
    assert(icmp("Rußland"w, "Russland") == 0);
    assert(icmp("Rußland", "Russland"w) == 0);
    assert(icmp("Rußland"w, "Russland"w) == 0);
    assert(icmp("Rußland"d, "Russland"w) == 0);
    assert(icmp("Rußland"w, "Russland"d) == 0);
}
8231 
// Non-template overloads for the most common cases (char, wchar and dchar
// slices) to reduce compile time; each one forwards to the template icmp.
@safe @nogc pure nothrow
{
    int icmp(const(char)[] str1, const(char)[] str2)
    {
        alias Str = const(char)[];
        return icmp!(Str, Str)(str1, str2);
    }

    int icmp(const(wchar)[] str1, const(wchar)[] str2)
    {
        alias Str = const(wchar)[];
        return icmp!(Str, Str)(str1, str2);
    }

    int icmp(const(dchar)[] str1, const(dchar)[] str2)
    {
        alias Str = const(dchar)[];
        return icmp!(Str, Str)(str1, str2);
    }
}
8242 
// Runs the same battery against both icmp and sicmp, for every pairing of
// string/wstring/dstring, at run time and (via assertCTFEable) at compile time.
@safe unittest
{
    import std.algorithm.sorting : sort;
    import std.conv : to;
    import std.exception : assertCTFEable;
    assertCTFEable!(
    {
    static foreach (cfunc; AliasSeq!(icmp, sicmp))
    {{
        static foreach (S1; AliasSeq!(string, wstring, dstring))
        static foreach (S2; AliasSeq!(string, wstring, dstring))
        {
            // empty operands and prefix relationships
            assert(cfunc("".to!S1(), "".to!S2()) == 0);
            assert(cfunc("A".to!S1(), "".to!S2()) > 0);
            assert(cfunc("".to!S1(), "0".to!S2()) < 0);
            assert(cfunc("abc".to!S1(), "abc".to!S2()) == 0);
            assert(cfunc("abcd".to!S1(), "abc".to!S2()) > 0);
            assert(cfunc("abc".to!S1(), "abcd".to!S2()) < 0);
            // case differences only
            assert(cfunc("Abc".to!S1(), "aBc".to!S2()) == 0);
            assert(cfunc("авГуст".to!S1(), "АВгУСТ".to!S2()) == 0);
            // Check example:
            assert(cfunc("Август".to!S1(), "авгусТ".to!S2()) == 0);
            assert(cfunc("ΌΎ".to!S1(), "όύ".to!S2()) == 0);
        }
        // check that the order is properly agnostic to the case
        auto strs = [ "Apple", "ORANGE",  "orAcle", "amp", "banana"];
        sort!((a,b) => cfunc(a,b) < 0)(strs);
        assert(strs == ["amp", "Apple",  "banana", "orAcle", "ORANGE"]);
    }}
    // after ß matches "ss", the remaining "b" vs "a" decides the order
    assert(icmp("ßb", "ssa") > 0);
    // Check example:
    assert(icmp("Russland", "Rußland") == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
    assert(icmp("ΐ"w, "\u03B9\u0308\u0301") == 0);
    assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);
    // https://issues.dlang.org/show_bug.cgi?id=11057
    assert( icmp("K", "L") < 0 );
    });
}
8282 
// https://issues.dlang.org/show_bug.cgi?id=17372
// Compile-only regression check: icmp must accept the ranges produced by
// `joiner` when used as a sort predicate.
@safe pure unittest
{
    import std.algorithm.iteration : joiner, map;
    import std.algorithm.sorting : sort;
    import std.array : array;
    auto a = [["foo", "bar"], ["baz"]].map!(line => line.joiner(" ")).array.sort!((a, b) => icmp(a, b) < 0);
}
8291 
// This is package(std) for the moment to be used as a support tool for std.regex
// It needs a better API
/*
    Return a range of all $(CODEPOINTS) that casefold to
    and from this `ch`.

    A code point without an entry in simpleCaseTrie yields a range holding
    just `ch`; otherwise the range walks the whole bucket of simpleCaseTable
    that `ch` belongs to.
*/
package(std) auto simpleCaseFoldings(dchar ch) @safe
{
    import std.internal.unicode_tables : simpleCaseTable; // generated file
    alias sTable = simpleCaseTable;
    static struct Range
    {
    @safe pure nothrow:
        uint idx; //if == uint.max, then read c.
        union
        {
            dchar c; // == 0 - empty range
            uint len;
        }

        // "small" mode: the range holds a single code point in `c`
        @property bool isSmall() const
        {
            return idx == uint.max;
        }

        // single-code-point range
        this(dchar ch)
        {
            idx = uint.max;
            c = ch;
        }

        // range over `size` table entries beginning at `start`
        this(uint start, uint size)
        {
            idx = start;
            len = size;
        }

        @property dchar front() const
        {
            assert(!empty);
            if (!isSmall)
                return sTable(idx).ch;
            return c;
        }

        @property bool empty() const
        {
            return isSmall ? c == 0 : len == 0;
        }

        @property size_t length() const
        {
            if (!isSmall)
                return len;
            return c == 0 ? 0 : 1;
        }

        void popFront()
        {
            if (!isSmall)
            {
                idx++;
                len--;
                return;
            }
            // zero marks the single-element range as consumed
            c = 0;
        }
    }

    immutable slot = simpleCaseTrie[ch];
    if (slot == EMPTY_CASE_TRIE)
        return Range(ch);
    auto entry = sTable(slot);
    // `n` is the entry's position inside its bucket, so this is the bucket start
    immutable bucketStart = slot - entry.n;
    return Range(bucketStart, entry.size);
}
8372 
// Checks simpleCaseFoldings for a two-member bucket, an uncased character
// and a three-member bucket, at run time and at compile time.
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.searching : canFind;
    import std.array : array;
    import std.exception : assertCTFEable;
    assertCTFEable!((){
        auto r = simpleCaseFoldings('Э').array;
        assert(r.length == 2);
        assert(r.canFind('э') && r.canFind('Э'));
        // an uncased character folds only to itself
        auto sr = simpleCaseFoldings('~');
        assert(sr.equal("~"));
        //A with ring above - casefolds to the same bucket as Angstrom sign
        sr = simpleCaseFoldings('Å');
        assert(sr.length == 3);
        assert(sr.canFind('å') && sr.canFind('Å') && sr.canFind('\u212B'));
    });
}
8391 
8392 /++
8393     $(P Returns the $(S_LINK Combining class, combining class) of `ch`.)
8394 +/
8395 ubyte combiningClass(dchar ch) @safe pure nothrow @nogc
8396 {
8397     return combiningClassTrie[ch];
8398 }
8399 
8400 ///
8401 @safe unittest
8402 {
8403     // shorten the code
8404     alias CC = combiningClass;
8405 
8406     // combining tilda
8407     assert(CC('\u0303') == 230);
8408     // combining ring below
8409     assert(CC('\u0325') == 220);
8410     // the simple consequence is that  "tilda" should be
8411     // placed after a "ring below" in a sequence
8412 }
8413 
// Every ASCII code point has combining class 0; a few known non-zero
// classes are spot-checked afterwards.
@safe pure nothrow @nogc unittest
{
    foreach (ch; 0 .. 0x80)
        assert(combiningClass(ch) == 0);
    assert(combiningClass('\u05BD') == 22);
    assert(combiningClass('\u0300') == 230);
    assert(combiningClass('\u0317') == 220);
    assert(combiningClass('\u1939') == 222);
}
8423 
/// Unicode character decomposition type, used as the template argument of $(LREF decompose).
enum UnicodeDecomposition {
    /// Canonical decomposition. The result is a canonically equivalent sequence.
    Canonical,
    /**
         Compatibility decomposition. The result is a compatibility equivalent sequence.
         Note: Compatibility decomposition is a $(B lossy) conversion,
         typically suitable only for fuzzy matching and internal processing.
    */
    Compatibility
}
8435 
8436 /**
8437     Shorthand aliases for character decomposition type, passed as a
8438     template parameter to $(LREF decompose).
8439 */
8440 enum {
8441     Canonical = UnicodeDecomposition.Canonical,
8442     Compatibility = UnicodeDecomposition.Compatibility
8443 }
8444 
8445 /++
8446     Try to canonically compose 2 $(CHARACTERS).
8447     Returns the composed $(CHARACTER) if they do compose and dchar.init otherwise.
8448 
8449     The assumption is that `first` comes before `second` in the original text,
8450     usually meaning that the first is a starter.
8451 
8452     Note: Hangul syllables are not covered by this function.
8453     See `composeJamo` below.
8454 +/
8455 public dchar compose(dchar first, dchar second) pure nothrow @safe
8456 {
8457     import std.algorithm.iteration : map;
8458     import std.internal.unicode_comp : compositionTable, composeCntShift, composeIdxMask;
8459     import std.range : assumeSorted, stride;
8460     immutable packed = compositionJumpTrie[first];
8461     if (packed == ushort.max)
8462         return dchar.init;
8463     // unpack offset and length
8464     immutable idx = packed & composeIdxMask, cnt = packed >> composeCntShift;
8465     // TODO: optimize this micro binary search (no more then 4-5 steps)
8466     auto r = compositionTable.stride(2)[idx .. idx+cnt].assumeSorted();
8467     immutable target = r.lowerBound(second).length;
8468     if (target == cnt)
8469         return dchar.init;
8470     immutable entry = compositionTable[(idx+target)*2];
8471     if (entry != second)
8472         return dchar.init;
8473     return compositionTable[(idx+target)*2 + 1];
8474 }
8475 
8476 ///
8477 @safe unittest
8478 {
8479     assert(compose('A','\u0308') == '\u00C4');
8480     assert(compose('A', 'B') == dchar.init);
8481     assert(compose('C', '\u0301') == '\u0106');
8482     // note that the starter is the first one
8483     // thus the following doesn't compose
8484     assert(compose('\u0308', 'A') == dchar.init);
8485 }
8486 
8487 /++
8488     Returns a full $(S_LINK Canonical decomposition, Canonical)
8489     (by default) or $(S_LINK Compatibility decomposition, Compatibility)
8490     decomposition of $(CHARACTER) `ch`.
8491     If no decomposition is available returns a $(LREF Grapheme)
8492     with the `ch` itself.
8493 
8494     Note:
8495     This function also decomposes hangul syllables
8496     as prescribed by the standard.
8497 
8498     See_Also: $(LREF decomposeHangul) for a restricted version
8499     that takes into account only hangul syllables  but
8500     no other decompositions.
8501 +/
8502 public Grapheme decompose(UnicodeDecomposition decompType=Canonical)(dchar ch) @safe
8503 {
8504     import std.algorithm.searching : until;
8505     import std.internal.unicode_decomp : decompCompatTable, decompCanonTable;
8506     static if (decompType == Canonical)
8507     {
8508         alias table = decompCanonTable;
8509         alias mapping = canonMappingTrie;
8510     }
8511     else static if (decompType == Compatibility)
8512     {
8513         alias table = decompCompatTable;
8514         alias mapping = compatMappingTrie;
8515     }
8516     immutable idx = mapping[ch];
8517     if (!idx) // not found, check hangul arithmetic decomposition
8518         return decomposeHangul(ch);
8519     auto decomp = table[idx..$].until(0);
8520     return Grapheme(decomp);
8521 }
8522 
8523 ///
8524 @safe unittest
8525 {
8526     import std.algorithm.comparison : equal;
8527 
8528     assert(compose('A','\u0308') == '\u00C4');
8529     assert(compose('A', 'B') == dchar.init);
8530     assert(compose('C', '\u0301') == '\u0106');
8531     // note that the starter is the first one
8532     // thus the following doesn't compose
8533     assert(compose('\u0308', 'A') == dchar.init);
8534 
8535     assert(decompose('Ĉ')[].equal("C\u0302"));
8536     assert(decompose('D')[].equal("D"));
8537     assert(decompose('\uD4DC')[].equal("\u1111\u1171\u11B7"));
8538     assert(decompose!Compatibility('¹')[].equal("1"));
8539 }
8540 
8541 //----------------------------------------------------------------------------
8542 // Hangul specific composition/decomposition
8543 enum jamoSBase = 0xAC00;
8544 enum jamoLBase = 0x1100;
8545 enum jamoVBase = 0x1161;
8546 enum jamoTBase = 0x11A7;
8547 enum jamoLCount = 19, jamoVCount = 21, jamoTCount = 28;
8548 enum jamoNCount = jamoVCount * jamoTCount;
8549 enum jamoSCount = jamoLCount * jamoNCount;
8550 
// Tests if `ch` is a Hangul leading consonant jamo,
// i.e. lies in [jamoLBase, jamoLBase + jamoLCount).
bool isJamoL(dchar ch) pure nothrow @nogc @safe
{
    // the upper-bound check comes first: it rejects the ~ 1M code points
    // above the leading jamo range
    if (ch >= jamoLBase+jamoLCount)
        return false;
    return ch >= jamoLBase;
}
8557 
// Tests if `ch` is a Hangul trailing consonant jamo,
// i.e. lies in (jamoTBase, jamoTBase + jamoTCount).
bool isJamoT(dchar ch) pure nothrow @nogc @safe
{
    // the upper-bound check comes first: it rejects the ~ 1M code points
    // above the trailing jamo range
    if (ch >= jamoTBase+jamoTCount)
        return false;
    // Note: ch == jamoTBase doesn't indicate trailing jamo (TIndex must be > 0)
    return ch > jamoTBase;
}
8565 
// Tests if `ch` is a Hangul vowel jamo,
// i.e. lies in [jamoVBase, jamoVBase + jamoVCount).
bool isJamoV(dchar ch) pure nothrow @nogc @safe
{
    // the upper-bound check comes first: it rejects the ~ 1M code points
    // above the vowel range
    if (ch >= jamoVBase+jamoVCount)
        return false;
    return ch >= jamoVBase;
}
8572 
// Returns the zero-based offset of `ch` from jamoSBase when it falls inside
// the jamoSCount-sized syllable block, or -1 otherwise.
int hangulSyllableIndex(dchar ch) pure nothrow @nogc @safe
{
    immutable int offset = cast(int) ch - jamoSBase;
    if (offset < 0 || offset >= jamoSCount)
        return -1;
    return offset;
}
8578 
// internal helper: compose hangul syllables in place, leaving dchar.init in holes
// (the slots of the vowel and trailing jamos that were merged into a syllable)
void hangulRecompose(scope dchar[] seq) pure nothrow @nogc @safe
{
    size_t pos = 0;
    while (pos + 1 < seq.length)
    {
        // a syllable can only start with a <leading, vowel> pair
        if (!isJamoL(seq[pos]) || !isJamoV(seq[pos+1]))
        {
            pos++;
            continue;
        }

        immutable int indexL = seq[pos] - jamoLBase;
        immutable int indexV = seq[pos+1] - jamoVBase;
        immutable int indexLV = indexL * jamoNCount + indexV * jamoTCount;
        immutable hasTrailing = pos + 2 < seq.length && isJamoT(seq[pos+2]);
        if (hasTrailing)
        {
            // <L, V, T> collapses into one syllable
            seq[pos] = jamoSBase + indexLV + seq[pos+2] - jamoTBase;
            seq[pos+1] = dchar.init;
            seq[pos+2] = dchar.init;
            pos += 3;
        }
        else
        {
            // <L, V> collapses into one syllable
            seq[pos] = jamoSBase + indexLV;
            seq[pos+1] = dchar.init;
            pos += 2;
        }
    }
}
8607 
8608 //----------------------------------------------------------------------------
8609 public:
8610 
8611 /**
8612     Decomposes a Hangul syllable. If `ch` is not a composed syllable
8613     then this function returns $(LREF Grapheme) containing only `ch` as is.
8614 */
8615 Grapheme decomposeHangul(dchar ch) nothrow pure @safe
8616 {
8617     immutable idxS = cast(int) ch - jamoSBase;
8618     if (idxS < 0 || idxS >= jamoSCount) return Grapheme(ch);
8619     immutable idxL = idxS / jamoNCount;
8620     immutable idxV = (idxS % jamoNCount) / jamoTCount;
8621     immutable idxT = idxS % jamoTCount;
8622 
8623     immutable partL = jamoLBase + idxL;
8624     immutable partV = jamoVBase + idxV;
8625     if (idxT > 0) // there is a trailling consonant (T); <L,V,T> decomposition
8626         return Grapheme(partL, partV, jamoTBase + idxT);
8627     else // <L, V> decomposition
8628         return Grapheme(partL, partV);
8629 }
8630 
8631 ///
8632 @safe unittest
8633 {
8634     import std.algorithm.comparison : equal;
8635     assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
8636 }
8637 
8638 /++
8639     Try to compose hangul syllable out of a leading consonant (`lead`),
8640     a `vowel` and optional `trailing` consonant jamos.
8641 
8642     On success returns the composed LV or LVT hangul syllable.
8643 
8644     If any of `lead` and `vowel` are not a valid hangul jamo
8645     of the respective $(CHARACTER) class returns dchar.init.
8646 +/
8647 dchar composeJamo(dchar lead, dchar vowel, dchar trailing=dchar.init) pure nothrow @nogc @safe
8648 {
8649     if (!isJamoL(lead))
8650         return dchar.init;
8651     immutable indexL = lead - jamoLBase;
8652     if (!isJamoV(vowel))
8653         return dchar.init;
8654     immutable indexV = vowel - jamoVBase;
8655     immutable indexLV = indexL * jamoNCount + indexV * jamoTCount;
8656     immutable dchar syllable = jamoSBase + indexLV;
8657     return isJamoT(trailing) ? syllable + (trailing - jamoTBase) : syllable;
8658 }
8659 
8660 ///
8661 @safe unittest
8662 {
8663     assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
8664     // leaving out T-vowel, or passing any codepoint
8665     // that is not trailing consonant composes an LV-syllable
8666     assert(composeJamo('\u1111', '\u1171') == '\uD4CC');
8667     assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
8668     assert(composeJamo('\u1111', 'A') == dchar.init);
8669     assert(composeJamo('A', '\u1171') == dchar.init);
8670 }
8671 
// Spot-checks canonical and compatibility decompositions and repeats the
// documented Hangul examples.
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text;

    // decomposes `ch` with the given decomposition kind and compares against `r`
    static void testDecomp(UnicodeDecomposition T)(dchar ch, string r)
    {
        Grapheme g = decompose!T(ch);
        assert(equal(g[], r), text(g[], " vs ", r));
    }
    testDecomp!Canonical('\u1FF4', "\u03C9\u0301\u0345");
    testDecomp!Canonical('\uF907', "\u9F9C");
    testDecomp!Compatibility('\u33FF', "\u0067\u0061\u006C");
    testDecomp!Compatibility('\uA7F9', "\u0153");

    // check examples
    assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); // leave out the trailing consonant
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
    assert(composeJamo('\u1111', 'A') == dchar.init);
    assert(composeJamo('A', '\u1171') == dchar.init);
}
8695 
/**
    Enumeration type for normalization forms,
    passed as template parameter for functions like $(LREF normalize).
*/
enum NormalizationForm {
    /// canonical decomposition followed by recomposition
    NFC,
    /// canonical decomposition only
    NFD,
    /// compatibility decomposition followed by recomposition
    NFKC,
    /// compatibility decomposition only
    NFKD
}
8706 
8707 
enum {
    /**
        Shorthand aliases for the values indicating normalization forms.
    */
    NFC = NormalizationForm.NFC,
    ///ditto
    NFD = NormalizationForm.NFD,
    ///ditto
    NFKC = NormalizationForm.NFKC,
    ///ditto
    NFKD = NormalizationForm.NFKD
}
8720 
/++
    Returns `input` string normalized to the chosen form.
    Form C is used by default.

    For more information on normalization forms see
    the $(S_LINK Normalization, normalization section).

    Params:
        norm = the normalization form to produce
        input = the string to normalize

    Returns:
        `input` itself when no part of it needs normalization,
        otherwise a newly built array holding the normalized text.

    Note:
    In cases where the string in question is already normalized,
    it is returned unmodified and no memory allocation happens.
+/
/*
    WARNING: @trusted lambda inside - handle with same care as @trusted
        functions

    Despite being a template, the attributes do no harm since this doesn't work
    with user-defined range or character types anyway.
*/
pure @safe inout(C)[] normalize(NormalizationForm norm=NFC, C)
    (return scope inout(C)[] input)
{
    import std.algorithm.mutation : SwapStrategy;
    import std.algorithm.sorting : sort;
    import std.array : appender;
    import std.range : zip;

    // anchors[0] .. anchors[1] is the first piece that needs work;
    // everything before anchors[0] can be copied verbatim
    auto anchors = splitNormalized!norm(input);
    if (anchors[0] == input.length && anchors[1] == input.length)
        return input;
    // scratch buffers reused for every piece: decomposed code points
    // and their combining classes (kept in lockstep)
    dchar[] decomposed;
    decomposed.reserve(31);
    ubyte[] ccc;
    ccc.reserve(31);
    auto app = appender!(C[])();
    do
    {
        // copy the untouched prefix, then decompose the problematic piece
        app.put(input[0 .. anchors[0]]);
        foreach (dchar ch; input[anchors[0]..anchors[1]])
            static if (norm == NFD || norm == NFC)
            {
                foreach (dchar c; decompose!Canonical(ch)[])
                    decomposed ~= c;
            }
            else // NFKD & NFKC
            {
                foreach (dchar c; decompose!Compatibility(ch)[])
                    decomposed ~= c;
            }
        ccc.length = decomposed.length;
        // start of the current run of code points with non-zero combining class
        size_t firstNonStable = 0;
        ubyte lastClazz = 0;

        // canonical ordering: stable-sort every run of non-zero combining
        // classes by class, moving the code points along with the classes
        foreach (idx, dchar ch; decomposed)
        {
            immutable clazz = combiningClass(ch);
            ccc[idx] = clazz;
            if (clazz == 0 && lastClazz != 0)
            {
                // found a stable code point after unstable ones
                sort!("a[0] < b[0]", SwapStrategy.stable)
                    (zip(ccc[firstNonStable .. idx], decomposed[firstNonStable .. idx]));
                // no open run: makes the trailing sort below a no-op
                // unless a new run starts
                firstNonStable = decomposed.length;
            }
            else if (clazz != 0 && lastClazz == 0)
            {
                // found first unstable code point after stable ones
                firstNonStable = idx;
            }
            lastClazz = clazz;
        }
        // sort the run that extends to the end of the buffer, if any
        sort!("a[0] < b[0]", SwapStrategy.stable)
            (zip(ccc[firstNonStable..$], decomposed[firstNonStable..$]));
        static if (norm == NFC || norm == NFKC)
        {
            import std.algorithm.searching : countUntil;
            auto first = countUntil(ccc, 0);
            if (first >= 0) // no starters?? no recomposition
            {
                // recompose starter by starter; recompose returns the index
                // it stopped at, or decomposed.length when it ran to the end
                for (;;)
                {
                    immutable second = recompose(first, decomposed, ccc);
                    if (second == decomposed.length)
                        break;
                    first = second;
                }
                // 2nd pass for hangul syllables
                hangulRecompose(decomposed);
            }
        }
        static if (norm == NFD || norm == NFKD)
            app.put(decomposed);
        else
        {
            import std.algorithm.mutation : remove;
            // drop the dchar.init sentinels that recompose left in place
            // of merged code points
            auto clean = remove!("a == dchar.init", SwapStrategy.stable)(decomposed);
            app.put(decomposed[0 .. clean.length]);
        }
        // reset variables
        decomposed.length = 0;
        () @trusted {
            // assumeSafeAppend isn't considered pure as of writing, hence the
            // cast. It isn't pure in the sense that the elements after
            // the array in question are affected, but we don't use those
            // making the call pure for our purposes.
            (cast(void delegate() pure nothrow) {decomposed.assumeSafeAppend();})();
            ccc.length = 0;
            (cast(void delegate() pure nothrow) {ccc.assumeSafeAppend();})();
        } ();
        input = input[anchors[1]..$];
        // and move on
        anchors = splitNormalized!norm(input);
    } while (anchors[0] != input.length);
    // the remaining tail needs no normalization
    app.put(input[0 .. anchors[0]]);
    // the appender works on C[]; cast back to the qualifier of the input
    return () @trusted inout { return cast(inout(C)[]) app.data; } ();
}
8836 
///
@safe pure unittest
{
    // any encoding works
    wstring greet = "Hello world";
    assert(normalize(greet) is greet); // the same exact slice

    // An example of a character with all 4 forms being different:
    // Greek upsilon with acute and hook symbol (code point 0x03D3)
    assert(normalize!NFC("ϓ") == "\u03D3");
    assert(normalize!NFD("ϓ") == "\u03D2\u0301");
    assert(normalize!NFKC("ϓ") == "\u038E");
    assert(normalize!NFKD("ϓ") == "\u03A5\u0301");
}
8851 
// Decomposing forms on text where the affected character sits between
// or next to characters that need no work.
@safe pure unittest
{
    import std.conv : text;

    // on failure the message shows the string that was actually produced
    assert(normalize!NFD("abc\uF904def") == "abc\u6ED1def", text(normalize!NFD("abc\uF904def")));
    assert(normalize!NFKD("2¹⁰") == "210", normalize!NFKD("2¹⁰"));
    assert(normalize!NFD("Äffin") == "A\u0308ffin");

    // test with dstring
    dstring greet = "Hello world";
    assert(normalize(greet) is greet); // the same exact slice
}
8864 
// Canonically recomposes code points onto the starter at input[start].
// Works in place: a code point merged into the starter is overwritten with
// dchar.init as a sentinel. `ccc` holds the combining class of every element
// of `input`. Returns the index of the next code point with combining class 0
// that could not be merged, or input.length if the end was reached.
private size_t recompose(size_t start, scope dchar[] input, scope ubyte[] ccc) pure nothrow @safe
{
    assert(input.length == ccc.length);
    // Blocking rule: in a sequence beginning with a starter S, a character C
    // is blocked from S if and only if there is some character B between
    // S and C that is a starter or has the same or higher combining class
    // as C. Here S is input[start], C is input[pos], and lastCC tracks the
    // class of the latest unmerged character between them (classes are
    // sorted, so that is also the maximum).
    int lastCC = -1; // out of the 0 .. 255 range: nothing seen yet
    size_t pos = start + 1; // input[start] itself is the starter
    for (; pos != input.length; ++pos)
    {
        immutable cc = ccc[pos];
        if (cc > lastCC)
        {
            immutable merged = compose(input[start], input[pos]);
            if (merged != dchar.init)
            {
                input[start] = merged;
                input[pos] = dchar.init; // put a sentinel
                // a merged code point must not affect what follows,
                // so lastCC is left untouched
                continue;
            }
        }
        // not merged (blocked or no composition exists)
        lastCC = cc;
        if (lastCC == 0)
            break; // reached the next starter: stop here
    }
    return pos;
}
8918 
// Scans `input` for the first place that needs normalization.
// Returns a tuple of 2 indexes that delimit: the already normalized text,
// the piece that needs normalization and the rest of the input starting
// with a stable code point. Both indexes equal input.length when
// nothing needs to be done.
private auto splitNormalized(NormalizationForm norm, C)(scope const(C)[] input)
{
    import std.typecons : tuple;
    ubyte prevCC = 0;

    foreach (pos, dchar cp; input)
    {
        static if (norm == NFC)
        {
            // for NFC, code points below U+0300 are passed through
            // without any table lookup
            if (cp < 0x0300)
            {
                prevCC = 0;
                continue;
            }
        }
        immutable ubyte curCC = combiningClass(cp);
        // combining classes out of order
        immutable outOfOrder = prevCC > curCC && curCC != 0;
        if (outOfOrder || notAllowedIn!norm(cp))
            return seekStable!norm(pos, input);
        prevCC = curCC;
    }
    return tuple(input.length, input.length);
}
8949 
// Given the index `idx` of a code point that breaks normalization, finds
// the surrounding region that has to be re-normalized.
// Returns tuple(region_start, region_end) where region_start is the index of
// the nearest stable code point (combining class 0 and allowed in `norm`)
// before `idx`, or 0 if there is none, and region_end is the index of the
// first stable code point at or after `idx`, or input.length if there is none.
private auto seekStable(NormalizationForm norm, C)(size_t idx, const scope C[] input)
{
    import std.typecons : tuple;
    import std.utf : codeLength;

    auto br = input[0 .. idx];
    size_t region_start = 0;// default
    // walk backwards, one code point at a time
    while (!br.empty)
    {
        immutable dchar ch = br.back;
        // index at which `ch` starts in `input` (br always starts at 0)
        immutable chStart = br.length - codeLength!C(ch);
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            region_start = chStart;
            break;
        }
        // Drop the code point just inspected from the *back*; popping the
        // front would leave `br.back` unchanged and degrade the search to
        // "start at 0" whenever the last code point is not stable.
        br = br[0 .. chStart];
    }
    ///@@@BUG@@@ can't use find: " find is a nested function and can't be used..."
    size_t region_end=input.length;// end is $ by default
    foreach (i, dchar ch; input[idx..$])
    {
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            region_end = i+idx;
            break;
        }
    }
    // writeln("Region to normalize: ", input[region_start .. region_end]);
    return tuple(region_start, region_end);
}
8982 
/**
    Tests if dchar `ch` is always allowed (Quick_Check=YES) in normalization
    form `norm`.

    Params:
        norm = the normalization form to check against
        ch = the code point to test

    Returns: `true` if `ch` is always allowed in `norm`.
*/
public bool allowedIn(NormalizationForm norm)(dchar ch)
{
    immutable bool rejected = notAllowedIn!norm(ch);
    return !rejected;
}
8991 
///
@safe unittest
{
    // e.g. Cyrillic is always allowed, so is ASCII
    assert(allowedIn!NFC('я'));
    assert(allowedIn!NFD('я'));
    assert(allowedIn!NFKC('я'));
    assert(allowedIn!NFKD('я'));
    assert(allowedIn!NFC('Z'));
}
9002 
// not user friendly name but more direct:
// looks `ch` up in the quick-check trie that belongs to `norm` and
// returns true when `ch` is NOT always allowed in that form
private bool notAllowedIn(NormalizationForm norm)(dchar ch)
{
    static if (norm == NFC)
        alias qcTrie = nfcQCTrie;
    else static if (norm == NFD)
        alias qcTrie = nfdQCTrie;
    else static if (norm == NFKC)
        alias qcTrie = nfkcQCTrie;
    else static if (norm == NFKD)
        alias qcTrie = nfkdQCTrie;
    else
        // the condition has to be a compile-time false; a bare string
        // used as the condition is not one
        static assert(0, "Unknown normalization form " ~ norm.stringof);
    return qcTrie[ch];
}
9018 
// Instantiates allowedIn (and thereby notAllowedIn) for all four forms.
@safe unittest
{
    assert(allowedIn!NFC('я'));
    assert(allowedIn!NFD('я'));
    assert(allowedIn!NFKC('я'));
    assert(allowedIn!NFKD('я'));
    assert(allowedIn!NFC('Z'));
}
9027 
9028 }
9029 
9030 version (std_uni_bootstrap)
9031 {
9032     // old version used for bootstrapping of gen_uni.d that generates
9033     // up to date optimal versions of all of isXXX functions
9034     @safe pure nothrow @nogc public bool isWhite(dchar c)
9035     {
9036         import std.ascii : isWhite;
9037         return isWhite(c) ||
9038                c == lineSep || c == paraSep ||
9039                c == '\u0085' || c == '\u00A0' || c == '\u1680' || c == '\u180E' ||
9040                (c >= '\u2000' && c <= '\u200A') ||
9041                c == '\u202F' || c == '\u205F' || c == '\u3000';
9042     }
9043 }
9044 else
9045 {
9046 
// trusted -> avoid bounds check
// Thin private forwarders to the generated case-mapping tries and tables.
// For each of lower/title/upper there is: a full-mapping index lookup,
// a simple-mapping index lookup and an accessor for the mapping table.
@trusted pure nothrow @nogc private
{
    import std.internal.unicode_tables; // : toLowerTable, toTitleTable, toUpperTable; // generated file

    // hide template instances behind functions
    // https://issues.dlang.org/show_bug.cgi?id=13232
    ushort toLowerIndex(dchar c) { return toLowerIndexTrie[c]; }
    ushort toLowerSimpleIndex(dchar c) { return toLowerSimpleIndexTrie[c]; }
    dchar toLowerTab(size_t idx) { return toLowerTable[idx]; }

    ushort toTitleIndex(dchar c) { return toTitleIndexTrie[c]; }
    ushort toTitleSimpleIndex(dchar c) { return toTitleSimpleIndexTrie[c]; }
    dchar toTitleTab(size_t idx) { return toTitleTable[idx]; }

    ushort toUpperIndex(dchar c) { return toUpperIndexTrie[c]; }
    ushort toUpperSimpleIndex(dchar c) { return toUpperSimpleIndexTrie[c]; }
    dchar toUpperTab(size_t idx) { return toUpperTable[idx]; }
}
9066 
9067 public:
9068 
/++
    Whether or not `c` is a Unicode whitespace $(CHARACTER).
    (general Unicode category: Part of C0(tab, vertical tab, form feed,
    carriage return, and linefeed characters), Zs, Zl, Zp, and NEL(U+0085))

    Params:
        c = the code point to classify

    Returns: `true` if `c` is a whitespace $(CHARACTER).
+/
@safe pure nothrow @nogc
public bool isWhite(dchar c)
{
    // the lookup itself is a pregenerated binary search in a generated file
    static import std.internal.unicode_tables;
    return std.internal.unicode_tables.isWhiteGen(c);
}
9080 
/++
    Return whether `c` is a Unicode lowercase $(CHARACTER).

    Params:
        c = the code point to classify
+/
@safe pure nothrow @nogc
bool isLower(dchar c)
{
    static import std.ascii;
    // ASCII is answered by std.ascii, everything else by the trie
    return std.ascii.isASCII(c) ? std.ascii.isLower(c) : lowerCaseTrie[c];
}
9092 
// Inside this test plain `isLower` is std.ascii's (local import), while
// `.isLower` is the module-level function under test.
@safe unittest
{
    import std.ascii : isLower;
    // both must agree on the ASCII range
    foreach (v; 0 .. 0x80)
        assert(isLower(v) == .isLower(v));
    assert(.isLower('я'));
    assert(.isLower('й'));
    assert(!.isLower('Ж'));
    // Greek HETA
    assert(!.isLower('\u0370'));
    assert(.isLower('\u0371'));
    assert(!.isLower('\u039C')); // capital MU
    assert(.isLower('\u03B2')); // beta
    // from extended Greek
    assert(!.isLower('\u1F18'));
    assert(.isLower('\u1F00'));
    // no member of the lowerCase set may be reported as upper case
    foreach (v; unicode.lowerCase.byCodepoint)
        assert(.isLower(v) && !isUpper(v));
}
9112 
9113 
/++
    Return whether `c` is a Unicode uppercase $(CHARACTER).

    Params:
        c = the code point to classify
+/
@safe pure nothrow @nogc
bool isUpper(dchar c)
{
    static import std.ascii;
    // ASCII is answered by std.ascii, everything else by the trie
    return std.ascii.isASCII(c) ? std.ascii.isUpper(c) : upperCaseTrie[c];
}
9125 
// Inside this test plain `isUpper` is std.ascii's (local import), while
// `.isUpper` is the module-level function under test.
@safe unittest
{
    import std.ascii : isUpper;
    // both must agree on the ASCII range (this used to compare isLower
    // with itself, leaving isUpper unchecked for ASCII)
    foreach (v; 0 .. 0x80)
        assert(isUpper(v) == .isUpper(v));
    assert(!.isUpper('й'));
    assert(.isUpper('Ж'));
    // Greek HETA
    assert(.isUpper('\u0370'));
    assert(!.isUpper('\u0371'));
    assert(.isUpper('\u039C')); // capital MU
    assert(!.isUpper('\u03B2')); // beta
    // from extended Greek
    assert(!.isUpper('\u1F00'));
    assert(.isUpper('\u1F18'));
    // no member of the upperCase set may be reported as lower case
    foreach (v; unicode.upperCase.byCodepoint)
        assert(.isUpper(v) && !isLower(v));
}
9144 
9145 
//TODO: Hidden for now, needs better API.
//Other transforms could use better API as well, but this one is a new primitive.
// Returns the simple title-case mapping of `c`, or `c` itself when
// there is none.
@safe pure nothrow @nogc
private dchar toTitlecase(dchar c)
{
    if (c >= 0xAA)
    {
        // table-driven path; ushort.max marks "no mapping"
        immutable size_t slot = toTitleSimpleIndex(c);
        return slot != ushort.max ? toTitleTab(slot) : c;
    }
    // fast path: below 0xAA only ASCII 'a' .. 'z' is changed
    if (c >= 'a' && c <= 'z')
        return c - 32;
    return c;
}
9167 
// Bundles of (index lookup, bound below which an index denotes a simple
// one-to-one mapping, table accessor) that are passed as the leading
// template arguments of the case-conversion helpers.
private alias UpperTriple = AliasSeq!(toUpperIndex, MAX_SIMPLE_UPPER, toUpperTab);
private alias LowerTriple = AliasSeq!(toLowerIndex, MAX_SIMPLE_LOWER, toLowerTab);
9170 
// generic toUpper/toLower on whole string, creates new or returns as is
//
// indexFn      maps a code point to an index into the mapping table,
//              ushort.max meaning "no mapping"
// maxIdx       indexes below it denote a one-to-one mapping
// tableFn      reads the mapping table at an index
// asciiConvert converts an ASCII character
//
// Nothing is allocated until the first code point that has a mapping
// is found; if there is none, `s` is returned as is (for strings).
private ElementEncodingType!S[] toCase(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, S)(S s)
if (isSomeString!S || (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    import std.array : appender, array;
    import std.ascii : isASCII;
    import std.utf : byDchar, codeLength;

    alias C = ElementEncodingType!S;

    auto r = s.byDchar;
    // i tracks the offset of r.front in code units of `s`; it has to be
    // advanced before r.popFront() while r.front is still the old code point
    for (size_t i; !r.empty; i += r.front.codeLength!C , r.popFront())
    {
        auto cOuter = r.front;
        ushort idx = indexFn(cOuter);
        if (idx == ushort.max)
            continue;
        // first code point with a mapping: copy the unchanged prefix
        // and convert everything from here on
        auto result = appender!(C[])();
        result.reserve(s.length);
        result.put(s[0 .. i]);
        foreach (dchar c; s[i .. $].byDchar)
        {
            if (c.isASCII)
            {
                result.put(asciiConvert(c));
            }
            else
            {
                idx = indexFn(c);
                if (idx == ushort.max)
                    result.put(c);
                else if (idx < maxIdx)
                {
                    // one-to-one mapping
                    c = tableFn(idx);
                    result.put(c);
                }
                else
                {
                    // one-to-many mapping: the first table entry packs the
                    // sequence length into the top 8 bits and the first
                    // code point into the low 24 bits
                    auto val = tableFn(idx);
                    // unpack length + codepoint
                    immutable uint len = val >> 24;
                    result.put(cast(dchar)(val & 0xFF_FFFF));
                    // the remaining len - 1 code points follow in the table
                    foreach (j; idx+1 .. idx+len)
                        result.put(tableFn(j));
                }
            }
        }
        return result.data;
    }

    // nothing had a mapping
    static if (isSomeString!S)
        return s;
    else
        return s.array;
}
9226 
// https://issues.dlang.org/show_bug.cgi?id=12428
// `s` is a short slice of a much longer array; calling toUpper on it
// must leave the contents of `s` untouched.
@safe unittest
{
    import std.array : replicate;
    auto s = "abcdefghij".replicate(300);
    s = s[0 .. 10];

    toUpper(s);

    assert(s == "abcdefghij");
}
9238 
// https://issues.dlang.org/show_bug.cgi?id=18993
// Checked at compile time: a string whose only cased character follows
// multi-byte text must lower-case to the same length as its
// already-lower-case counterpart.
@safe unittest
{
    static assert(`몬스터/A`.toLower.length == `몬스터/a`.toLower.length);
}
9244 
9245 
// generic toUpper/toLower on whole range, returns range
//
// Wraps a range of dchar into a lazy range that yields the case-converted
// code points. indexFn, maxIdx and tableFn describe the mapping table
// (ushort.max from indexFn means "no mapping", an index below maxIdx is a
// one-to-one mapping); asciiConvert handles ASCII characters.
private auto toCaser(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCaserImpl
    {
        // empty once the source is exhausted and nothing is left buffered
        @property bool empty()
        {
            return !nLeft && r.empty;
        }

        // Converts the current source character on first use and buffers
        // the result; repeated calls return the buffered value.
        @property auto front()
        {
            import std.ascii : isASCII;

            if (!nLeft)
            {
                dchar c = r.front;
                if (c.isASCII)
                {
                    buf[0] = asciiConvert(c);
                    nLeft = 1;
                }
                else
                {
                    const idx = indexFn(c);
                    if (idx == ushort.max)
                    {
                        // no mapping: pass through
                        buf[0] = c;
                        nLeft = 1;
                    }
                    else if (idx < maxIdx)
                    {
                        // one-to-one mapping
                        buf[0] = tableFn(idx);
                        nLeft = 1;
                    }
                    else
                    {
                        // one-to-many mapping: top 8 bits of the first entry
                        // hold the length, low 24 bits the first code point
                        immutable val = tableFn(idx);
                        // unpack length + codepoint
                        nLeft = val >> 24;
                        if (nLeft == 0)
                            nLeft = 1;
                        assert(nLeft <= buf.length);
                        // stored in reverse so that buf[nLeft - 1] is always
                        // the next code point to hand out
                        buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF);
                        foreach (j; 1 .. nLeft)
                            buf[nLeft - j - 1] = tableFn(idx + j);
                    }
                }
            }
            return buf[nLeft - 1];
        }

        void popFront()
        {
            // make sure the current character is converted even if the
            // caller never asked for front
            if (!nLeft)
                front;
            assert(nLeft);
            --nLeft;
            // advance the source only after its whole expansion is consumed
            if (!nLeft)
                r.popFront();
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto ret = this;
                ret.r = r.save;
                return ret;
            }
        }

      private:
        Range r;            // underlying source of dchars
        uint nLeft;         // buffered code points not yet handed out; 0 = nothing converted
        dchar[3] buf = void; // expansion of the current character, in reverse order
    }

    return ToCaserImpl(str);
}
9330 
/*********************
 * Convert an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * or a string to upper or lower case.
 *
 * Does not allocate memory.
 * Characters in UTF-8 or UTF-16 format that cannot be decoded
 * are treated as $(REF replacementDchar, std,utf).
 *
 * Params:
 *      str = string or range of characters
 *
 * Returns:
 *      an input range of `dchar`s
 *
 * See_Also:
 *      $(LREF toUpper), $(LREF toLower)
 */

auto asLowerCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof >= dchar.sizeof)
    {
        // already a range of dchar: convert directly
        static import std.ascii;
        return toCaser!(LowerTriple, std.ascii.toLower)(str);
    }
    else
    {
        // narrow encoding: decode to dchar first, then convert
        import std.utf : byDchar;
        return asLowerCase(str.byDchar);
    }
}
9366 
/// ditto
auto asUpperCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof >= dchar.sizeof)
    {
        // already a range of dchar: convert directly
        static import std.ascii;
        return toCaser!(UpperTriple, std.ascii.toUpper)(str);
    }
    else
    {
        // narrow encoding: decode to dchar first, then convert
        import std.utf : byDchar;
        return asUpperCase(str.byDchar);
    }
}
9385 
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // the result is a lazy range, so compare element-wise
    assert("hEllo".asUpperCase.equal("HELLO"));
}
9393 
// explicitly undocumented
// overload for types that convert to a string: forwards to the range
// version instantiated with the underlying string type
auto asLowerCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;
    alias Str = StringTypeOf!Range;
    return asLowerCase!Str(str);
}
9401 
// explicitly undocumented
// overload for types that convert to a string: forwards to the range
// version instantiated with the underlying string type
auto asUpperCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;
    alias Str = StringTypeOf!Range;
    return asUpperCase!Str(str);
}
9409 
// The string-convertible overloads must give the same result as calling
// the functions with a plain string.
@safe unittest
{
    // non-copyable wrapper that converts to string through alias this
    static struct TestAliasedString
    {
        string get() @safe @nogc pure nothrow { return _s; }
        alias get this;
        @disable this(this);
        string _s;
    }

    // runs `func` on the wrapper and on the bare string and compares
    static bool testAliasedString(alias func, Args...)(string s, Args args)
    {
        import std.algorithm.comparison : equal;
        auto a = func(TestAliasedString(s), args);
        auto b = func(s, args);
        static if (is(typeof(equal(a, b))))
        {
            // For ranges, compare contents instead of object identity.
            return equal(a, b);
        }
        else
        {
            return a == b;
        }
    }
    assert(testAliasedString!asLowerCase("hEllo"));
    assert(testAliasedString!asUpperCase("hEllo"));
    assert(testAliasedString!asCapitalized("hEllo"));
}
9439 
// asLowerCase/asUpperCase: save, agreement with toLower/toUpper
// (including one-to-many mappings) and wide/dchar inputs.
@safe unittest
{
    import std.array : array;

    // a saved copy must replay the same sequence
    auto a = "HELLo".asLowerCase;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "hello");
    s = savea.array;
    assert(s == "hello");

    string[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
    string[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

    // the lazy ranges must agree with the eager string functions
    foreach (i, slwr; lower)
    {
        import std.utf : byChar;

        auto sx = slwr.asUpperCase.byChar.array;
        assert(sx == toUpper(slwr));
        auto sy = upper[i].asLowerCase.byChar.array;
        assert(sy == toLower(upper[i]));
    }

    // Not necessary to call r.front
    for (auto r = lower[3].asUpperCase; !r.empty; r.popFront())
    {
    }

    import std.algorithm.comparison : equal;

    // these results used to be discarded, so nothing was checked
    assert("HELLo"w.asLowerCase.equal("hello"d));
    assert("HELLo"w.asUpperCase.equal("HELLO"d));
    assert("HELLo"d.asLowerCase.equal("hello"d));
    assert("HELLo"d.asUpperCase.equal("HELLO"d));

    import std.utf : byChar;
    assert(toLower("\u1Fe2") == asLowerCase("\u1Fe2").byChar.array);
}
9479 
// generic capitalizer on whole range, returns range
//
// Wraps a range of dchar into a lazy range that yields the upper-case
// mapping of the first character followed by the lower-cased rest.
// indexFnUpper, maxIdxUpper and tableFnUpper describe the upper-case
// mapping table (ushort.max from the index function means "no mapping",
// an index below maxIdxUpper is a one-to-one mapping).
private auto toCapitalizer(alias indexFnUpper, uint maxIdxUpper, alias tableFnUpper,
                           Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCapitalizerImpl
    {
        @property bool empty()
        {
            // past the first character everything is delegated to lwr
            return lower ? lwr.empty : !nLeft && r.empty;
        }

        @property auto front()
        {
            if (lower)
                return lwr.front;

            // first character: convert on first use and buffer the result
            if (!nLeft)
            {
                immutable dchar c = r.front;
                const idx = indexFnUpper(c);
                if (idx == ushort.max)
                {
                    // no mapping: pass through
                    buf[0] = c;
                    nLeft = 1;
                }
                else if (idx < maxIdxUpper)
                {
                    // one-to-one mapping
                    buf[0] = tableFnUpper(idx);
                    nLeft = 1;
                }
                else
                {
                    // one-to-many mapping: top 8 bits of the first entry
                    // hold the length, low 24 bits the first code point
                    immutable val = tableFnUpper(idx);
                    // unpack length + codepoint
                    nLeft = val >> 24;
                    if (nLeft == 0)
                        nLeft = 1;
                    assert(nLeft <= buf.length);
                    // stored in reverse so that buf[nLeft - 1] is always
                    // the next code point to hand out
                    buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF);
                    foreach (j; 1 .. nLeft)
                        buf[nLeft - j - 1] = tableFnUpper(idx + j);
                }
            }
            return buf[nLeft - 1];
        }

        void popFront()
        {
            if (lower)
                lwr.popFront();
            else
            {
                // make sure the first character is converted even if the
                // caller never asked for front
                if (!nLeft)
                    front;
                assert(nLeft);
                --nLeft;
                if (!nLeft)
                {
                    // expansion of the first character is used up:
                    // switch over to lower-casing the remainder
                    r.popFront();
                    lwr = r.asLowerCase();
                    lower = true;
                }
            }
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto ret = this;
                ret.r = r.save;
                ret.lwr = lwr.save;
                return ret;
            }
        }

      private:
        Range r;
        typeof(r.asLowerCase) lwr; // range representing the lower case rest of string
        bool lower = false;     // false for first character, true for rest of string
        dchar[3] buf = void;    // expansion of the first character, in reverse order
        uint nLeft = 0;         // buffered code points not yet handed out
    }

    return ToCapitalizerImpl(str);
}
9570 
/*********************
 * Capitalize an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
 * or string, meaning convert the first
 * character to upper case and subsequent characters to lower case.
 *
 * Does not allocate memory.
 * Characters in UTF-8 or UTF-16 format that cannot be decoded
 * are treated as $(REF replacementDchar, std,utf).
 *
 * Params:
 *      str = string or range of characters
 *
 * Returns:
 *      an InputRange of dchars
 *
 * See_Also:
 *      $(LREF toUpper), $(LREF toLower)
 *      $(LREF asUpperCase), $(LREF asLowerCase)
 */

auto asCapitalized(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof >= dchar.sizeof)
    {
        // already a range of dchar
        return toCapitalizer!UpperTriple(str);
    }
    else
    {
        // narrow encoding: decode to dchar first
        import std.utf : byDchar;
        return toCapitalizer!UpperTriple(str.byDchar);
    }
}
9607 
///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // first character upper-cased, the rest lower-cased
    assert("hEllo".asCapitalized.equal("Hello"));
}
9615 
// overload for types that convert to a string: forwards to the range
// version instantiated with the underlying string type
auto asCapitalized(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;
    alias Str = StringTypeOf!Range;
    return asCapitalized!Str(str);
}
9622 
// The attributes on this test require that constructing the range and
// reading its front are @safe, pure, nothrow and @nogc for a string.
@safe pure nothrow @nogc unittest
{
    auto r = "hEllo".asCapitalized();
    assert(r.front == 'H');
}
9628 
// asCapitalized: save, a table of input/expected pairs (including a
// one-to-many mapping) and wide/dchar inputs.
@safe unittest
{
    import std.array : array;

    // a saved copy must replay the same sequence
    auto a = "hELLo".asCapitalized;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "Hello");
    s = savea.array;
    assert(s == "Hello");

    // [input, expected]
    string[2][] cases =
    [
        ["", ""],
        ["h", "H"],
        ["H", "H"],
        ["3", "3"],
        ["123", "123"],
        ["h123A", "H123a"],
        ["феж", "Феж"],
        ["\u1Fe2", "\u03a5\u0308\u0300"],
    ];

    foreach (i; 0 .. cases.length)
    {
        import std.utf : byChar;

        auto r = cases[i][0].asCapitalized.byChar.array;
        auto result = cases[i][1];
        assert(r == result);
    }

    // Don't call r.front
    for (auto r = "\u1Fe2".asCapitalized; !r.empty; r.popFront())
    {
    }

    import std.algorithm.comparison : equal;

    // these results used to be discarded, so nothing was checked
    assert("HELLo"w.asCapitalized.equal("Hello"d));
    assert("hElLO"w.asCapitalized.equal("Hello"d));
    assert("hello"d.asCapitalized.equal("Hello"d));
    assert("HELLO"d.asCapitalized.equal("Hello"d));

    import std.utf : byChar;
    assert(asCapitalized("\u0130").byChar.array == asUpperCase("\u0130").byChar.array);
}
9676 
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes the UTF-8 encoding of `c` into `buf` starting at `idx` and returns
// the index just past the last code unit written.
private size_t encodeTo(scope char[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    if (c <= 0x7F)
    {
        // single code unit
        buf[idx] = cast(char) c;
        return idx + 1;
    }
    if (c <= 0x7FF)
    {
        // lead byte + 1 continuation byte
        buf[idx] = cast(char)(0xC0 | (c >> 6));
        buf[idx+1] = cast(char)(0x80 | (c & 0x3F));
        return idx + 2;
    }
    if (c <= 0xFFFF)
    {
        // lead byte + 2 continuation bytes
        buf[idx] = cast(char)(0xE0 | (c >> 12));
        buf[idx+1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[idx+2] = cast(char)(0x80 | (c & 0x3F));
        return idx + 3;
    }
    if (c <= 0x10FFFF)
    {
        // lead byte + 3 continuation bytes
        buf[idx] = cast(char)(0xF0 | (c >> 18));
        buf[idx+1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[idx+2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[idx+3] = cast(char)(0x80 | (c & 0x3F));
        return idx + 4;
    }
    // values above 0x10FFFF are never expected here
    assert(0);
}
9710 
@safe unittest
{
    char[] buf = "abcd".dup;

    // an ASCII code point overwrites exactly one code unit
    size_t next = encodeTo(buf, 0, 'X');
    assert(buf == "Xbcd");

    // U+00A9 needs two UTF-8 code units, written at the returned index
    next = encodeTo(buf, next, cast(dchar)'\u00A9');
    assert(buf == "X\xC2\xA9d");
}
9721 
// TODO: helper, I wish std.utf was more flexible (and straight)
// Writes the UTF-16 encoding of `c` into `buf` starting at `idx` and returns
// the index just past the last code unit written.
// Throws: UTFException if `c` is a surrogate code point.
private size_t encodeTo(scope wchar[] buf, size_t idx, dchar c) @trusted pure
{
    import std.utf : UTFException;

    // values above 0x10FFFF are never expected here
    if (c > 0x10FFFF)
        assert(0);

    if (c > 0xFFFF)
    {
        // supplementary plane: split the 20-bit offset into a surrogate pair
        immutable offset = c - 0x10000;
        buf[idx] = cast(wchar)(((offset >> 10) & 0x3FF) + 0xD800);
        buf[idx+1] = cast(wchar)((offset & 0x3FF) + 0xDC00);
        return idx + 2;
    }

    if (0xD800 <= c && c <= 0xDFFF)
        throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c);

    // BMP code point: one code unit
    buf[idx] = cast(wchar) c;
    return idx + 1;
}
9743 
// UTF-32 variant: stores `c` at `buf[idx]` and returns the next index.
private size_t encodeTo(scope dchar[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    buf[idx] = c;
    return idx + 1;
}
9750 
// Re-cases `s` inside its own buffer as long as the converted text fits.
// Each code point is decoded and looked up with `indexFn`:
//   - `ushort.max` means the code point is left as is;
//   - an index below `maxIdx` is a 1:1 mapping, fetched with `tableFn`;
//   - any other index is a 1:m mapping.
// As soon as a replacement does not fit in place (or a 1:m mapping is met)
// the rest of the work is handed to `toCaseInPlaceAlloc`. On the in-place
// path `s` is finally shrunk to the number of code units written.
private void toCaseInPlace(alias indexFn, uint maxIdx, alias tableFn, C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar)  || is(C == dchar))
{
    import std.utf : decode, codeLength;
    size_t curIdx = 0;   // read position (advanced by decode)
    size_t destIdx = 0;  // write position, never ahead of curIdx (see assert below)
    alias slowToCase = toCaseInPlaceAlloc!(indexFn, maxIdx, tableFn);
    size_t lastUnchanged = 0; // start of the unchanged run not yet moved to destIdx
    // in-buffer move of bytes to a new start index
    // the trick is that it may not need to copy at all
    static size_t moveTo(C[] str, size_t dest, size_t from, size_t to)
    {
        // Interestingly we may just bump pointer for a while
        // then have to copy if a re-cased char was smaller the original
        // later we may regain pace with char that got bigger
        // In the end it sometimes flip-flops between the 2 cases below
        if (dest == from)
            return to;
        // got to copy
        foreach (C c; str[from .. to])
            str[dest++] = c;
        return dest;
    }
    while (curIdx != s.length)
    {
        size_t startIdx = curIdx; // start of the code point decoded next
        immutable ch = decode(s, curIdx);
        // TODO: special case for ASCII
        immutable caseIndex = indexFn(ch);
        if (caseIndex == ushort.max) // unchanged, skip over
        {
            continue;
        }
        else if (caseIndex < maxIdx)  // 1:1 codepoint mapping
        {
            // previous cased chars had the same length as uncased ones
            // thus can just adjust pointer
            destIdx = moveTo(s, destIdx, lastUnchanged, startIdx);
            lastUnchanged = curIdx;
            immutable cased = tableFn(caseIndex);
            immutable casedLen = codeLength!C(cased);
            if (casedLen + destIdx > curIdx) // no place to fit cased char
            {
                // switch to slow codepath, where we allocate
                // (restarts at startIdx, i.e. the current code point is redone there)
                return slowToCase(s, startIdx, destIdx);
            }
            else
            {
                destIdx = encodeTo(s, destIdx, cased);
            }
        }
        else  // 1:m codepoint mapping, slow codepath
        {
            destIdx = moveTo(s, destIdx, lastUnchanged, startIdx);
            lastUnchanged = curIdx;
            return slowToCase(s, startIdx, destIdx);
        }
        // the write position must never overtake the read position
        assert(destIdx <= curIdx);
    }
    // flush the trailing run of unchanged code units, if any
    if (lastUnchanged != s.length)
    {
        destIdx = moveTo(s, destIdx, lastUnchanged, s.length);
    }
    s = s[0 .. destIdx];
}
9816 
// Helper that computes, without writing anything, how many code units of type
// `C` the case-converted form of a string occupies.
private template toCaseLength(alias indexFn, uint maxIdx, alias tableFn)
{
    size_t toCaseLength(C)(const scope C[] str)
    {
        import std.utf : decode, codeLength;

        size_t total = 0;    // code units counted so far
        size_t pending = 0;  // start of the unchanged run not yet added to total

        for (size_t pos = 0; pos != str.length; )
        {
            immutable cpStart = pos;
            immutable ch = decode(str, pos);
            immutable ushort caseIndex = indexFn(ch);

            // unchanged code point: it stays part of the pending run
            if (caseIndex == ushort.max)
                continue;

            // account for the unchanged run that ends right before this code point
            total += cpStart - pending;
            pending = pos;

            if (caseIndex < maxIdx)
            {
                // 1:1 mapping
                total += codeLength!C(tableFn(caseIndex));
            }
            else
            {
                // 1:m mapping: the first entry packs the sequence length in
                // its top 8 bits and the first code point in the low 24 bits
                immutable packed = tableFn(caseIndex);
                immutable count = packed >> 24;
                immutable dchar first = packed & 0xFF_FFFF;
                total += codeLength!C(first);
                foreach (j; caseIndex+1 .. caseIndex+count)
                    total += codeLength!C(tableFn(j));
            }
        }

        // trailing unchanged run
        if (pending != str.length)
            total += str.length - pending;
        return total;
    }
}
9857 
@safe unittest
{
    alias lowerLen = toCaseLength!(LowerTriple);

    // plain ASCII that is already lower case keeps its length
    assert(lowerLen("abcd") == 4);
    // five two-byte Cyrillic letters followed by three ASCII digits
    assert(lowerLen("аБВгд456") == 10+3);
}
9864 
// slower code path that preallocates and then copies
// case-converted stuff to the new string
//
// `s[0 .. destIdx]` is taken as already converted output and copied verbatim;
// conversion resumes at `s[curIdx]`. On return `s` refers to the newly
// allocated array.
private template toCaseInPlaceAlloc(alias indexFn, uint maxIdx, alias tableFn)
{
    void toCaseInPlaceAlloc(C)(ref C[] s, size_t curIdx,
        size_t destIdx) @trusted pure
        if (is(C == char) || is(C == wchar) || is(C == dchar))
    {
        import std.utf : decode;
        alias caseLength = toCaseLength!(indexFn, maxIdx, tableFn);
        // exact size of the result: finished prefix + converted remainder
        auto trueLength = destIdx + caseLength(s[curIdx..$]);
        C[] ns = new C[trueLength];
        ns[0 .. destIdx] = s[0 .. destIdx];
        size_t lastUnchanged = curIdx; // start of the unchanged run not yet copied
        while (curIdx != s.length)
        {
            immutable startIdx = curIdx; // start of current codepoint
            immutable ch = decode(s, curIdx);
            immutable caseIndex = indexFn(ch);
            if (caseIndex == ushort.max) // skip over
            {
                continue;
            }
            else if (caseIndex < maxIdx)  // 1:1 codepoint mapping
            {
                immutable cased = tableFn(caseIndex);
                // copy the unchanged run preceding this code point, then the replacement
                auto toCopy = startIdx - lastUnchanged;
                ns[destIdx .. destIdx+toCopy] = s[lastUnchanged .. startIdx];
                lastUnchanged = curIdx;
                destIdx += toCopy;
                destIdx = encodeTo(ns, destIdx, cased);
            }
            else  // 1:m codepoint mapping, slow codepath
            {
                auto toCopy = startIdx - lastUnchanged;
                ns[destIdx .. destIdx+toCopy] = s[lastUnchanged .. startIdx];
                lastUnchanged = curIdx;
                destIdx += toCopy;
                auto val = tableFn(caseIndex);
                // unpack length + codepoint
                immutable uint len = val >> 24;
                destIdx = encodeTo(ns, destIdx, cast(dchar)(val & 0xFF_FFFF));
                // the remaining len-1 code points follow in consecutive table slots
                foreach (j; caseIndex+1 .. caseIndex+len)
                    destIdx = encodeTo(ns, destIdx, tableFn(j));
            }
        }
        // copy the trailing unchanged run, if any
        if (lastUnchanged != s.length)
        {
            auto toCopy = s.length - lastUnchanged;
            ns[destIdx .. destIdx+toCopy] = s[lastUnchanged..$];
            destIdx += toCopy;
        }
        // the precomputed length must match what was actually written
        assert(ns.length == destIdx);
        s = ns;
    }
}
9921 
9922 /++
9923     Converts `s` to lowercase (by performing Unicode lowercase mapping) in place.
9924     For a few characters string length may increase after the transformation,
9925     in such a case the function reallocates exactly once.
9926     If `s` does not have any uppercase characters, then `s` is unaltered.
9927 +/
9928 void toLowerInPlace(C)(ref C[] s) @trusted pure
9929 if (is(C == char) || is(C == wchar) || is(C == dchar))
9930 {
9931     toCaseInPlace!(LowerTriple)(s);
9932 }
// overloads for the most common cases to reduce compile time
// (each one forwards to the template instantiated for its character type)
@safe pure /*TODO nothrow*/
{
    void toLowerInPlace(ref char[] s)
    { toLowerInPlace!char(s); }
    void toLowerInPlace(ref wchar[] s)
    { toLowerInPlace!wchar(s); }
    void toLowerInPlace(ref dchar[] s)
    { toLowerInPlace!dchar(s); }
}
9943 
9944 /++
9945     Converts `s` to uppercase  (by performing Unicode uppercase mapping) in place.
9946     For a few characters string length may increase after the transformation,
9947     in such a case the function reallocates exactly once.
9948     If `s` does not have any lowercase characters, then `s` is unaltered.
9949 +/
9950 void toUpperInPlace(C)(ref C[] s) @trusted pure
9951 if (is(C == char) || is(C == wchar) || is(C == dchar))
9952 {
9953     toCaseInPlace!(UpperTriple)(s);
9954 }
// overloads for the most common cases to reduce compile time/code size
// (each one forwards to the template instantiated for its character type)
@safe pure /*TODO nothrow*/
{
    void toUpperInPlace(ref char[] s)
    { toUpperInPlace!char(s); }
    void toUpperInPlace(ref wchar[] s)
    { toUpperInPlace!wchar(s); }
    void toUpperInPlace(ref dchar[] s)
    { toUpperInPlace!dchar(s); }
}
9965 
9966 /++
9967     If `c` is a Unicode uppercase $(CHARACTER), then its lowercase equivalent
9968     is returned. Otherwise `c` is returned.
9969 
9970     Warning: certain alphabets like German and Greek have no 1:1
9971     upper-lower mapping. Use overload of toLower which takes full string instead.
9972 +/
9973 @safe pure nothrow @nogc
9974 dchar toLower(dchar c)
9975 {
9976      // optimize ASCII case
9977     if (c < 0xAA)
9978     {
9979         if (c < 'A')
9980             return c;
9981         if (c <= 'Z')
9982             return c + 32;
9983         return c;
9984     }
9985     size_t idx = toLowerSimpleIndex(c);
9986     if (idx != ushort.max)
9987     {
9988         return toLowerTab(idx);
9989     }
9990     return c;
9991 }
9992 
9993 /++
9994     Creates a new array which is identical to `s` except that all of its
9995     characters are converted to lowercase (by performing Unicode lowercase mapping).
9996     If none of `s` characters were affected, then `s` itself is returned if `s` is a
9997     `string`-like type.
9998 
9999     Params:
10000         s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
10001         of characters
10002     Returns:
10003         An array with the same element type as `s`.
10004 +/
10005 ElementEncodingType!S[] toLower(S)(return scope S s) @trusted
10006 if (isSomeString!S)
10007 {
10008     static import std.ascii;
10009     return toCase!(LowerTriple, std.ascii.toLower)(s);
10010 }
10011 
/// ditto
ElementEncodingType!S[] toLower(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    // same forwarding as the string overload, for non-string random access ranges
    static import std.ascii;
    return toCase!(LowerTriple, std.ascii.toLower)(s);
}
10019 
// overloads for the most common cases to reduce compile time
// (each one forwards to the template instantiated for its string type)
@safe pure /*TODO nothrow*/
{
    string toLower(return scope string s)
    { return toLower!string(s); }
    wstring toLower(return scope wstring s)
    { return toLower!wstring(s); }
    dstring toLower(return scope dstring s)
    { return toLower!dstring(s); }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663

        // a struct that converts to string through `alias this`
        static struct String
        {
            string data;
            alias data this;
        }

        // never called: it only has to compile
        void foo()
        {
            auto u = toLower(String(""));
        }
    }
}
10046 
10047 
@safe unittest
{
    static import std.ascii;
    import std.format : format;

    // on the ASCII range the result must agree with std.ascii
    foreach (code; 0 .. 0x80)
    {
        assert(std.ascii.toLower(code) == toLower(code));
    }

    // single code points outside ASCII
    assert(toLower('Я') == 'я');
    assert(toLower('Δ') == 'δ');

    // every upper case code point maps either to itself or to a lower case one
    foreach (upperCp; unicode.upperCase.byCodepoint)
    {
        dchar lowered = upperCp.toLower();
        assert(lowered == upperCp || isLower(lowered), format("%s -> %s", upperCp, lowered));
    }

    // string overloads
    assert(toLower("АЯ") == "ая");

    assert("\u1E9E".toLower == "\u00df");
    assert("\u00df".toUpper == "SS");
}
10066 
// https://issues.dlang.org/show_bug.cgi?id=9629
// Converting a slice in place must write through to the buffer it aliases.
@safe unittest
{
    wchar[] whole = "hello þ world"w.dup;
    auto thorn = whole[6 .. 7];     // just the 'þ'
    thorn.toUpperInPlace();
    assert(whole == "hello Þ world");
}
10075 
10076 
// toLower on strings must agree with toLowerInPlace on a mutable copy;
// s1 is the input, s2 the toLower result, s3 the in-place converted copy.
@safe unittest
{
    import std.algorithm.comparison : cmp;
    string s1 = "FoL";
    string s2 = toLower(s1);
    assert(cmp(s2, "fol") == 0, s2);
    assert(s2 != s1);

    char[] s3 = s1.dup;
    toLowerInPlace(s3);
    assert(s3 == s2);

    // mix of ASCII and U+0100/U+0101
    s1 = "A\u0100B\u0101d";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(cmp(s2, "a\u0101b\u0101d") == 0);
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    // mix of ASCII and U+0460/U+0461
    s1 = "A\u0460B\u0461d";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(cmp(s2, "a\u0461b\u0461d") == 0);
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    // U+0130 lowercases to two code points ("i" followed by U+0307)
    s1 = "\u0130";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(s2 == "i\u0307");
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    // Test on wchar and dchar strings.
    assert(toLower("Some String"w) == "some string"w);
    assert(toLower("Some String"d) == "some string"d);

    // https://issues.dlang.org/show_bug.cgi?id=12455
    dchar c = 'İ'; // '\U0130' LATIN CAPITAL LETTER I WITH DOT ABOVE
    assert(isUpper(c));
    // the single code point overload yields plain 'i' here
    assert(toLower(c) == 'i');
    // extends on https://issues.dlang.org/show_bug.cgi?id=12455 report
    // check simple-case toUpper too
    c = '\u1f87';
    assert(isLower(c));
    assert(toUpper(c) == '\u1F8F');
}
10127 
// toLower on random access ranges of code units
@safe pure unittest
{
    import std.algorithm.comparison : cmp;
    import std.utf : byCodeUnit;

    auto ascii = byCodeUnit("FoL");
    assert(cmp(toLower(ascii), "fol") == 0);

    auto mixed = byCodeUnit("A\u0460B\u0461d");
    assert(cmp(toLower(mixed), "a\u0461b\u0461d") == 0);
}
10137 
10138 /++
10139     If `c` is a Unicode lowercase $(CHARACTER), then its uppercase equivalent
10140     is returned. Otherwise `c` is returned.
10141 
10142     Warning:
10143     Certain alphabets like German and Greek have no 1:1
10144     upper-lower mapping. Use overload of toUpper which takes full string instead.
10145 
10146     toUpper can be used as an argument to $(REF map, std,algorithm,iteration)
10147     to produce an algorithm that can convert a range of characters to upper case
10148     without allocating memory.
10149     A string can then be produced by using $(REF copy, std,algorithm,mutation)
10150     to send it to an $(REF appender, std,array).
10151 +/
10152 @safe pure nothrow @nogc
10153 dchar toUpper(dchar c)
10154 {
10155     // optimize ASCII case
10156     if (c < 0xAA)
10157     {
10158         if (c < 'a')
10159             return c;
10160         if (c <= 'z')
10161             return c - 32;
10162         return c;
10163     }
10164     size_t idx = toUpperSimpleIndex(c);
10165     if (idx != ushort.max)
10166     {
10167         return toUpperTab(idx);
10168     }
10169     return c;
10170 }
10171 
10172 ///
10173 @safe unittest
10174 {
10175     import std.algorithm.iteration : map;
10176     import std.algorithm.mutation : copy;
10177     import std.array : appender;
10178 
10179     auto abuf = appender!(char[])();
10180     "hello".map!toUpper.copy(abuf);
10181     assert(abuf.data == "HELLO");
10182 }
10183 
@safe unittest
{
    static import std.ascii;
    import std.format : format;

    // on the ASCII range the result must agree with std.ascii
    foreach (code; 0 .. 0x80)
    {
        assert(std.ascii.toUpper(code) == toUpper(code));
    }

    // single code points outside ASCII
    assert(toUpper('я') == 'Я');
    assert(toUpper('δ') == 'Δ');

    // every lower case code point maps to itself, to an upper case code point
    // or to a titlecase letter
    auto titlecase = unicode.Titlecase_Letter;
    foreach (lowerCp; unicode.lowerCase.byCodepoint)
    {
        dchar raised = lowerCp.toUpper();
        assert(raised == lowerCp || isUpper(raised) || titlecase[raised],
            format("%x -> %x", lowerCp, raised));
    }
}
10200 
10201 /++
10202     Allocates a new array which is identical to `s` except that all of its
10203     characters are converted to uppercase (by performing Unicode uppercase mapping).
10204     If none of `s` characters were affected, then `s` itself is returned if `s`
10205     is a `string`-like type.
10206 
10207     Params:
10208         s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
10209         of characters
10210     Returns:
10211         An new array with the same element type as `s`.
10212 +/
10213 ElementEncodingType!S[] toUpper(S)(return scope S s) @trusted
10214 if (isSomeString!S)
10215 {
10216     static import std.ascii;
10217     return toCase!(UpperTriple, std.ascii.toUpper)(s);
10218 }
10219 
/// ditto
ElementEncodingType!S[] toUpper(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    // same forwarding as the string overload, for non-string random access ranges
    static import std.ascii;
    return toCase!(UpperTriple, std.ascii.toUpper)(s);
}
10227 
// overloads for the most common cases to reduce compile time
// (each one forwards to the template instantiated for its string type)
@safe pure /*TODO nothrow*/
{
    string toUpper(return scope string s)
    { return toUpper!string(s); }
    wstring toUpper(return scope wstring s)
    { return toUpper!wstring(s); }
    dstring toUpper(return scope dstring s)
    { return toUpper!dstring(s); }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663

        // a struct that converts to string through `alias this`
        static struct String
        {
            string data;
            alias data this;
        }

        // never called: it only has to compile
        void foo()
        {
            auto u = toUpper(String(""));
        }
    }
}
10254 
// toUpper on strings must agree with toUpperInPlace on a mutable copy;
// s1 is the input, s2 the toUpper result, s3 the in-place converted copy.
@safe unittest
{
    import std.algorithm.comparison : cmp;

    string s1 = "FoL";
    string s2;
    char[] s3;

    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2, s3);
    assert(cmp(s2, "FOL") == 0);
    assert(s2 !is s1);

    // mix of ASCII and U+0100/U+0101
    s1 = "a\u0100B\u0101d";
    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2);
    assert(cmp(s2, "A\u0100B\u0100D") == 0);
    assert(s2 !is s1);

    // mix of ASCII and U+0460/U+0461
    s1 = "a\u0460B\u0461d";
    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2);
    assert(cmp(s2, "A\u0460B\u0460D") == 0);
    assert(s2 !is s1);
}
10283 
@safe unittest
{
    // Checks toLower, toUpper and both in-place variants on `s` against the
    // expected upper case (`trueUp`) and lower case (`trueLow`) results.
    static void doTest(C)(const(C)[] s, const(C)[] trueUp, const(C)[] trueLow)
    {
        import std.format : format;
        // three slots: source, actual result, expected result
        string diff = "src: %( %x %)\nres: %( %x %)\ntru: %( %x %)";
        auto low = s.toLower() , up = s.toUpper();
        auto lowInp = s.dup, upInp = s.dup;
        lowInp.toLowerInPlace();
        upInp.toUpperInPlace();
        // every message supplies all three arguments `diff` asks for; with
        // only two, building the failure message would itself throw and hide
        // the actual mismatch
        assert(low == trueLow, format(diff, s, low, trueLow));
        assert(up == trueUp,  format(diff, s, up, trueUp));
        assert(lowInp == trueLow,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) lowInp, cast(const(ubyte)[]) trueLow));
        assert(upInp == trueUp,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) upInp, cast(const(ubyte)[]) trueUp));
    }
    static foreach (S; AliasSeq!(dstring, wstring, string))
    {{

        S easy = "123";
        S good = "abCФеж";
        S awful = "\u0131\u023f\u2126";
        S wicked = "\u0130\u1FE2";
        auto options = [easy, good, awful, wicked];
        // expected results, index-aligned with `options`
        S[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
        S[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

        // for these inputs the in-place conversions must keep the original buffer
        foreach (val; [easy, good])
        {
            auto e = val.dup;
            auto g = e;
            e.toUpperInPlace();
            assert(e is g);
            e.toLowerInPlace();
            assert(e is g);
        }
        foreach (i, v; options)
        {
            doTest(v, upper[i], lower[i]);
        }

        // a few combinatorial runs
        foreach (i; 0 .. options.length)
        foreach (j; i .. options.length)
        foreach (k; j .. options.length)
        {
            auto sample = options[i] ~ options[j] ~ options[k];
            auto sample2 = options[k] ~ options[j] ~ options[i];
            doTest(sample, upper[i] ~ upper[j] ~ upper[k],
                lower[i] ~ lower[j] ~ lower[k]);
            doTest(sample2, upper[k] ~ upper[j] ~ upper[i],
                lower[k] ~ lower[j] ~ lower[i]);
        }
    }}
}
10340 
// toUpper on random access ranges of code units
@safe pure unittest
{
    import std.algorithm.comparison : cmp;
    import std.utf : byCodeUnit;

    auto ascii = byCodeUnit("FoL");
    assert(cmp(toUpper(ascii), "FOL") == 0);

    auto mixed = byCodeUnit("a\u0460B\u0461d");
    assert(cmp(toUpper(mixed), "A\u0460B\u0460D") == 0);
}
10351 
10352 /++
10353     Returns whether `c` is a Unicode alphabetic $(CHARACTER)
10354     (general Unicode category: Alphabetic).
10355 +/
10356 @safe pure nothrow @nogc
10357 bool isAlpha(dchar c)
10358 {
10359     // optimization
10360     if (c < 0xAA)
10361     {
10362         return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z');
10363     }
10364 
10365     return alphaTrie[c];
10366 }
10367 
@safe unittest
{
    auto alphabetic = unicode("Alphabetic");

    // every member of the reference set is recognized ...
    foreach (cp; alphabetic.byCodepoint)
    {
        assert(isAlpha(cp));
    }
    // ... and on the first 0x4000 code points both agree exactly
    foreach (cp; 0 .. 0x4000)
    {
        assert((cp in alphabetic) == isAlpha(cp));
    }
}
10376 
10377 
10378 /++
10379     Returns whether `c` is a Unicode mark
10380     (general Unicode category: Mn, Me, Mc).
10381 +/
10382 @safe pure nothrow @nogc
10383 bool isMark(dchar c)
10384 {
10385     return markTrie[c];
10386 }
10387 
@safe unittest
{
    auto marks = unicode("Mark");

    // every member of the reference set is recognized ...
    foreach (cp; marks.byCodepoint)
    {
        assert(isMark(cp));
    }
    // ... and on the first 0x4000 code points both agree exactly
    foreach (cp; 0 .. 0x4000)
    {
        assert((cp in marks) == isMark(cp));
    }
}
10396 
10397 /++
10398     Returns whether `c` is a Unicode numerical $(CHARACTER)
10399     (general Unicode category: Nd, Nl, No).
10400 +/
10401 @safe pure nothrow @nogc
10402 bool isNumber(dchar c)
10403 {
10404     // optimization for ascii case
10405     if (c <= 0x7F)
10406     {
10407         return c >= '0' && c <= '9';
10408     }
10409     else
10410     {
10411         return numberTrie[c];
10412     }
10413 }
10414 
@safe unittest
{
    auto numbers = unicode("N");

    // every member of the reference set is recognized ...
    foreach (cp; numbers.byCodepoint)
    {
        assert(isNumber(cp));
    }
    // ... and on the first 0x4000 code points both agree exactly
    foreach (cp; 0 .. 0x4000)
    {
        assert((cp in numbers) == isNumber(cp));
    }
}
10423 
10424 /++
10425     Returns whether `c` is a Unicode alphabetic $(CHARACTER) or number.
10426     (general Unicode category: Alphabetic, Nd, Nl, No).
10427 
10428     Params:
10429         c = any Unicode character
10430     Returns:
10431         `true` if the character is in the Alphabetic, Nd, Nl, or No Unicode
10432         categories
10433 +/
10434 @safe pure nothrow @nogc
10435 bool isAlphaNum(dchar c)
10436 {
10437     static import std.ascii;
10438 
10439     // optimization for ascii case
10440     if (std.ascii.isASCII(c))
10441     {
10442         return std.ascii.isAlphaNum(c);
10443     }
10444     else
10445     {
10446         return isAlpha(c) || isNumber(c);
10447     }
10448 }
10449 
@safe unittest
{
    auto numbers = unicode("N");
    auto alphabetic = unicode("Alphabetic");

    // members of either reference set are recognized
    foreach (cp; numbers.byCodepoint)
    {
        assert(isAlphaNum(cp));
    }
    foreach (cp; alphabetic.byCodepoint)
    {
        assert(isAlphaNum(cp));
    }

    // on the first 0x4000 code points the predicate equals the union of both sets
    foreach (cp; 0 .. 0x4000)
        assert(((cp in numbers) || (cp in alphabetic)) == isAlphaNum(cp));
}
10466 
10467 /++
10468     Returns whether `c` is a Unicode punctuation $(CHARACTER)
10469     (general Unicode category: Pd, Ps, Pe, Pc, Po, Pi, Pf).
10470 +/
10471 @safe pure nothrow @nogc
10472 bool isPunctuation(dchar c)
10473 {
10474     static import std.ascii;
10475 
10476     // optimization for ascii case
10477     if (c <= 0x7F)
10478     {
10479         return std.ascii.isPunctuation(c);
10480     }
10481     else
10482     {
10483         return punctuationTrie[c];
10484     }
10485 }
10486 
@safe unittest
{
    // a handful of ASCII and Latin-1 punctuation code points
    static immutable dchar[] samples =
        ['\u0021', '\u0028', '\u0029', '\u002D', '\u005F', '\u00AB', '\u00BB'];
    foreach (cp; samples)
    {
        assert(isPunctuation(cp));
    }

    // every member of the reference set is recognized
    foreach (cp; unicode("P").byCodepoint)
    {
        assert(isPunctuation(cp));
    }
}
10499 
10500 /++
10501     Returns whether `c` is a Unicode symbol $(CHARACTER)
10502     (general Unicode category: Sm, Sc, Sk, So).
10503 +/
10504 @safe pure nothrow @nogc
10505 bool isSymbol(dchar c)
10506 {
10507    return symbolTrie[c];
10508 }
10509 
@safe unittest
{
    import std.format : format;

    // a handful of ASCII and Latin-1 symbol code points
    static immutable dchar[] samples = ['\u0024', '\u002B', '\u005E', '\u00A6'];
    foreach (cp; samples)
    {
        assert(isSymbol(cp));
    }

    // every member of the reference set is recognized
    foreach (cp; unicode("S").byCodepoint)
    {
        assert(isSymbol(cp), format("%04x", cp));
    }
}
10520 
10521 /++
10522     Returns whether `c` is a Unicode space $(CHARACTER)
10523     (general Unicode category: Zs)
10524     Note: This doesn't include '\n', '\r', \t' and other non-space $(CHARACTER).
10525     For commonly used less strict semantics see $(LREF isWhite).
10526 +/
10527 @safe pure nothrow @nogc
10528 bool isSpace(dchar c)
10529 {
10530     import std.internal.unicode_tables : isSpaceGen; // generated file
10531     return isSpaceGen(c);
10532 }
10533 
@safe unittest
{
    assert(isSpace('\u0020'));

    auto separators = unicode.Zs;
    // every member of the reference set is recognized ...
    foreach (cp; separators.byCodepoint)
    {
        assert(isSpace(cp));
    }
    // ... and on the first 0x1000 code points both agree exactly
    foreach (cp; 0 .. 0x1000)
    {
        assert(isSpace(cp) == separators[cp]);
    }
}
10543 
10544 
10545 /++
10546     Returns whether `c` is a Unicode graphical $(CHARACTER)
10547     (general Unicode category: L, M, N, P, S, Zs).
10548 
10549 +/
10550 @safe pure nothrow @nogc
10551 bool isGraphical(dchar c)
10552 {
10553     return graphicalTrie[c];
10554 }
10555 
10556 
@safe unittest
{
    import std.format : format;

    auto graphical = unicode("Graphical");

    // every member of the reference set is recognized ...
    foreach (cp; graphical.byCodepoint)
    {
        assert(isGraphical(cp), format("%4x", cp));
    }
    // ... and on the first 0x4000 code points both agree exactly
    foreach (cp; 0 .. 0x4000)
    {
        assert((cp in graphical) == isGraphical(cp));
    }
}
10566 
10567 
10568 /++
10569     Returns whether `c` is a Unicode control $(CHARACTER)
10570     (general Unicode category: Cc).
10571 +/
10572 @safe pure nothrow @nogc
10573 bool isControl(dchar c)
10574 {
10575     import std.internal.unicode_tables : isControlGen; // generated file
10576     return isControlGen(c);
10577 }
10578 
@safe unittest
{
    // spot checks on both sides of the boundary
    assert(isControl('\u0000'));
    assert(isControl('\u0081'));
    assert(!isControl('\u0100'));

    auto controls = unicode.Cc;
    // every member of the reference set is recognized ...
    foreach (cp; controls.byCodepoint)
    {
        assert(isControl(cp));
    }
    // ... and on the first 0x1000 code points both agree exactly
    foreach (cp; 0 .. 0x1000)
    {
        assert(isControl(cp) == controls[cp]);
    }
}
10590 
10591 
10592 /++
10593     Returns whether `c` is a Unicode formatting $(CHARACTER)
10594     (general Unicode category: Cf).
10595 +/
10596 @safe pure nothrow @nogc
10597 bool isFormat(dchar c)
10598 {
10599     import std.internal.unicode_tables : isFormatGen; // generated file
10600     return isFormatGen(c);
10601 }
10602 
10603 
@safe unittest
{
    // U+00AD is expected to be classified as a format character
    assert(isFormat('\u00AD'));

    // every member of the reference set is recognized
    foreach (cp; unicode("Format").byCodepoint)
    {
        assert(isFormat(cp));
    }
}
10610 
// Code points for private use and surrogates are not likely to change in the near future;
// if need be, they can be generated from Unicode data as well.
10613 
10614 /++
10615     Returns whether `c` is a Unicode Private Use $(CODEPOINT)
10616     (general Unicode category: Co).
10617 +/
10618 @safe pure nothrow @nogc
10619 bool isPrivateUse(dchar c)
10620 {
10621     return (0x00_E000 <= c && c <= 0x00_F8FF)
10622         || (0x0F_0000 <= c && c <= 0x0F_FFFD)
10623         || (0x10_0000 <= c && c <= 0x10_FFFD);
10624 }
10625 
/++
    Tests whether `c` is a Unicode surrogate $(CODEPOINT)
    (general Unicode category: Cs).

    Params:
        c = the $(CODEPOINT) to classify

    Returns:
        `true` if `c` is in the range U+D800 .. U+DFFF, `false` otherwise.
+/
@safe pure nothrow @nogc
bool isSurrogate(dchar c)
{
    enum uint firstSurrogate = 0xD800;
    enum uint lastSurrogate = 0xDFFF;

    return c >= firstSurrogate && c <= lastSurrogate;
}
10635 
/++
    Tests whether `c` is a Unicode high surrogate (lead surrogate).

    Params:
        c = the $(CODEPOINT) to classify

    Returns:
        `true` if `c` is in the range U+D800 .. U+DBFF, `false` otherwise.
+/
@safe pure nothrow @nogc
bool isSurrogateHi(dchar c)
{
    enum uint firstLead = 0xD800;
    enum uint lastLead = 0xDBFF;

    return c >= firstLead && c <= lastLead;
}
10644 
/++
    Tests whether `c` is a Unicode low surrogate (trail surrogate).

    Params:
        c = the $(CODEPOINT) to classify

    Returns:
        `true` if `c` is in the range U+DC00 .. U+DFFF, `false` otherwise.
+/
@safe pure nothrow @nogc
bool isSurrogateLo(dchar c)
{
    enum uint firstTrail = 0xDC00;
    enum uint lastTrail = 0xDFFF;

    return c >= firstTrail && c <= lastTrail;
}
10653 
/++
    Tests whether `c` is a Unicode non-character, i.e.
    a $(CODEPOINT) with no assigned abstract character
    (general Unicode category: Cn).

    Params:
        c = the $(CODEPOINT) to classify

    Returns:
        `true` if `c` is found in the non-character table, `false` otherwise.
+/
@safe pure nothrow @nogc
bool isNonCharacter(dchar c)
{
    // Membership is answered by indexing the pre-built non-character trie.
    immutable bool found = nonCharacterTrie[c];
    return found;
}
10664 
@safe unittest
{
    // Every member of the "Cn" set must be accepted by the predicate.
    auto unassigned = unicode("Cn");
    foreach (cp; unassigned.byCodepoint)
        assert(isNonCharacter(cp));
}
10671 
10672 private:
// Load static data from the pre-generated tables into usable data structures.
10674 
10675 
// Builds a CodepointSet from a compressed interval table:
// the bytes are decompressed into intervals, which then populate the set.
@safe auto asSet(const (ubyte)[] compressed) pure
{
    auto intervals = decompressIntervals(compressed);
    return CodepointSet.fromIntervals(intervals);
}
10680 
// Wraps the offsets, sizes and data of a pre-generated TrieEntry
// in a const CodepointTrie with the same template arguments.
@safe pure nothrow auto asTrie(T...)(const scope TrieEntry!T e)
{
    alias ResultTrie = const(CodepointTrie!T);
    return ResultTrie(e.offsets, e.sizes, e.data);
}
10685 
@safe pure nothrow @nogc @property
{
    // Every accessor below follows the same pattern: a function-local
    // `static immutable` trie is built from its pre-generated entries
    // with `asTrie` and then returned.
    //
    // It's important to use auto return here, so that the compiler
    // only runs semantic on the return type if the function gets
    // used. Also these are functions rather than templates to not
    // increase the object size of the caller.

    // case and general-category tables
    auto lowerCaseTrie()
    {
        static immutable trie = asTrie(lowerCaseTrieEntries);
        return trie;
    }

    auto upperCaseTrie()
    {
        static immutable trie = asTrie(upperCaseTrieEntries);
        return trie;
    }

    auto simpleCaseTrie()
    {
        static immutable trie = asTrie(simpleCaseTrieEntries);
        return trie;
    }

    auto fullCaseTrie()
    {
        static immutable trie = asTrie(fullCaseTrieEntries);
        return trie;
    }

    auto alphaTrie()
    {
        static immutable trie = asTrie(alphaTrieEntries);
        return trie;
    }

    auto markTrie()
    {
        static immutable trie = asTrie(markTrieEntries);
        return trie;
    }

    auto numberTrie()
    {
        static immutable trie = asTrie(numberTrieEntries);
        return trie;
    }

    auto punctuationTrie()
    {
        static immutable trie = asTrie(punctuationTrieEntries);
        return trie;
    }

    auto symbolTrie()
    {
        static immutable trie = asTrie(symbolTrieEntries);
        return trie;
    }

    auto graphicalTrie()
    {
        static immutable trie = asTrie(graphicalTrieEntries);
        return trie;
    }

    auto nonCharacterTrie()
    {
        static immutable trie = asTrie(nonCharacterTrieEntries);
        return trie;
    }

    // normalization quick-check tables
    auto nfcQCTrie()
    {
        import std.internal.unicode_norm : nfcQCTrieEntries;
        static immutable trie = asTrie(nfcQCTrieEntries);
        return trie;
    }

    auto nfdQCTrie()
    {
        import std.internal.unicode_norm : nfdQCTrieEntries;
        static immutable trie = asTrie(nfdQCTrieEntries);
        return trie;
    }

    auto nfkcQCTrie()
    {
        import std.internal.unicode_norm : nfkcQCTrieEntries;
        static immutable trie = asTrie(nfkcQCTrieEntries);
        return trie;
    }

    auto nfkdQCTrie()
    {
        import std.internal.unicode_norm : nfkdQCTrieEntries;
        static immutable trie = asTrie(nfkdQCTrieEntries);
        return trie;
    }

    // grapheme breaking algorithm tables
    auto spacingMarkTrie()
    {
        import std.internal.unicode_grapheme : spacingMarkTrieEntries;
        static immutable trie = asTrie(spacingMarkTrieEntries);
        return trie;
    }

    auto graphemeExtendTrie()
    {
        import std.internal.unicode_grapheme : graphemeExtendTrieEntries;
        static immutable trie = asTrie(graphemeExtendTrieEntries);
        return trie;
    }

    auto hangLV()
    {
        import std.internal.unicode_grapheme : hangulLVTrieEntries;
        static immutable trie = asTrie(hangulLVTrieEntries);
        return trie;
    }

    auto hangLVT()
    {
        import std.internal.unicode_grapheme : hangulLVTTrieEntries;
        static immutable trie = asTrie(hangulLVTTrieEntries);
        return trie;
    }

    auto prependTrie()
    {
        import std.internal.unicode_grapheme : prependTrieEntries;
        static immutable trie = asTrie(prependTrieEntries);
        return trie;
    }

    auto graphemeControlTrie()
    {
        import std.internal.unicode_grapheme : controlTrieEntries;
        static immutable trie = asTrie(controlTrieEntries);
        return trie;
    }

    auto xpictoTrie()
    {
        import std.internal.unicode_grapheme : Extended_PictographicTrieEntries;
        static immutable trie = asTrie(Extended_PictographicTrieEntries);
        return trie;
    }

    // tables below are used for composition/decomposition
    auto combiningClassTrie()
    {
        import std.internal.unicode_comp : combiningClassTrieEntries;
        static immutable trie = asTrie(combiningClassTrieEntries);
        return trie;
    }

    auto compatMappingTrie()
    {
        import std.internal.unicode_decomp : compatMappingTrieEntries;
        static immutable trie = asTrie(compatMappingTrieEntries);
        return trie;
    }

    auto canonMappingTrie()
    {
        import std.internal.unicode_decomp : canonMappingTrieEntries;
        static immutable trie = asTrie(canonMappingTrieEntries);
        return trie;
    }

    auto compositionJumpTrie()
    {
        import std.internal.unicode_comp : compositionJumpTrieEntries;
        static immutable trie = asTrie(compositionJumpTrieEntries);
        return trie;
    }

    // case conversion tables
    auto toUpperIndexTrie()
    {
        static immutable trie = asTrie(toUpperIndexTrieEntries);
        return trie;
    }

    auto toLowerIndexTrie()
    {
        static immutable trie = asTrie(toLowerIndexTrieEntries);
        return trie;
    }

    auto toTitleIndexTrie()
    {
        static immutable trie = asTrie(toTitleIndexTrieEntries);
        return trie;
    }

    // simple case conversion tables
    auto toUpperSimpleIndexTrie()
    {
        static immutable trie = asTrie(toUpperSimpleIndexTrieEntries);
        return trie;
    }

    auto toLowerSimpleIndexTrie()
    {
        static immutable trie = asTrie(toLowerSimpleIndexTrieEntries);
        return trie;
    }

    auto toTitleSimpleIndexTrie()
    {
        static immutable trie = asTrie(toTitleSimpleIndexTrieEntries);
        return trie;
    }

}
10822 
10823 }// version (!std_uni_bootstrap)