1 // Written in the D programming language.
2 
3 /++
4     $(P The `std.uni` module provides an implementation
5     of fundamental Unicode algorithms and data structures.
6     This doesn't include UTF encoding and decoding primitives,
7     see $(REF decode, std,_utf) and $(REF encode, std,_utf) in $(MREF std, utf)
8     for this functionality. )
9 
10 $(SCRIPT inhibitQuickIndex = 1;)
11 $(DIVC quickindex,
12 $(BOOKTABLE,
13 $(TR $(TH Category) $(TH Functions))
14 $(TR $(TD Decode) $(TD
15     $(LREF byCodePoint)
16     $(LREF byGrapheme)
17     $(LREF decodeGrapheme)
18     $(LREF graphemeStride)
19 ))
20 $(TR $(TD Comparison) $(TD
21     $(LREF icmp)
22     $(LREF sicmp)
23 ))
24 $(TR $(TD Classification) $(TD
25     $(LREF isAlpha)
26     $(LREF isAlphaNum)
27     $(LREF isCodepointSet)
28     $(LREF isControl)
29     $(LREF isFormat)
30     $(LREF isGraphical)
31     $(LREF isIntegralPair)
32     $(LREF isMark)
33     $(LREF isNonCharacter)
34     $(LREF isNumber)
35     $(LREF isPrivateUse)
36     $(LREF isPunctuation)
37     $(LREF isSpace)
38     $(LREF isSurrogate)
39     $(LREF isSurrogateHi)
40     $(LREF isSurrogateLo)
41     $(LREF isSymbol)
42     $(LREF isWhite)
43 ))
44 $(TR $(TD Normalization) $(TD
45     $(LREF NFC)
46     $(LREF NFD)
47     $(LREF NFKD)
48     $(LREF NormalizationForm)
49     $(LREF normalize)
50 ))
51 $(TR $(TD Decompose) $(TD
52     $(LREF decompose)
53     $(LREF decomposeHangul)
54     $(LREF UnicodeDecomposition)
55 ))
56 $(TR $(TD Compose) $(TD
57     $(LREF compose)
58     $(LREF composeJamo)
59 ))
60 $(TR $(TD Sets) $(TD
61     $(LREF CodepointInterval)
62     $(LREF CodepointSet)
63     $(LREF InversionList)
64     $(LREF unicode)
65 ))
66 $(TR $(TD Trie) $(TD
67     $(LREF codepointSetTrie)
68     $(LREF CodepointSetTrie)
69     $(LREF codepointTrie)
70     $(LREF CodepointTrie)
71     $(LREF toTrie)
72     $(LREF toDelegate)
73 ))
74 $(TR $(TD Casing) $(TD
75     $(LREF asCapitalized)
76     $(LREF asLowerCase)
77     $(LREF asUpperCase)
78     $(LREF isLower)
79     $(LREF isUpper)
80     $(LREF toLower)
81     $(LREF toLowerInPlace)
82     $(LREF toUpper)
83     $(LREF toUpperInPlace)
84 ))
85 $(TR $(TD Utf8Matcher) $(TD
86     $(LREF isUtfMatcher)
87     $(LREF MatcherConcept)
88     $(LREF utfMatcher)
89 ))
90 $(TR $(TD Separators) $(TD
91     $(LREF lineSep)
92     $(LREF nelSep)
93     $(LREF paraSep)
94 ))
95 $(TR $(TD Building blocks) $(TD
96     $(LREF allowedIn)
97     $(LREF combiningClass)
98     $(LREF Grapheme)
99 ))
100 ))
101 
102     $(P All primitives listed operate on Unicode characters and
103         sets of characters. For functions which operate on ASCII characters
104         and ignore Unicode $(CHARACTERS), see $(MREF std, ascii).
105         For definitions of Unicode $(CHARACTER), $(CODEPOINT) and other terms
106         used throughout this module see the $(S_LINK Terminology, terminology) section
107         below.
108     )
109     $(P The focus of this module is the core needs of developing Unicode-aware
110         applications. To that effect it provides the following optimized primitives:
111     )
112     $(UL
113         $(LI Character classification by category and common properties:
114             $(LREF isAlpha), $(LREF isWhite) and others.
115         )
116         $(LI
117             Case-insensitive string comparison ($(LREF sicmp), $(LREF icmp)).
118         )
119         $(LI
120             Converting text to any of the four normalization forms via $(LREF normalize).
121         )
122         $(LI
123             Decoding ($(LREF decodeGrapheme))  and iteration ($(LREF byGrapheme), $(LREF graphemeStride))
124             by user-perceived characters, that is by $(LREF Grapheme) clusters.
125         )
126         $(LI
127             Decomposing and composing of individual character(s) according to canonical
128             or compatibility rules, see $(LREF compose) and $(LREF decompose),
129             including the specific version for Hangul syllables $(LREF composeJamo)
130             and $(LREF decomposeHangul).
131         )
132     )
133     $(P It's recognized that an application may need further enhancements
134         and extensions, such as less commonly known algorithms,
135         or tailoring existing ones for region specific needs. To help users
136         with building any extra functionality beyond the core primitives,
137         the module provides:
138     )
139     $(UL
140         $(LI
141             $(LREF CodepointSet), a type for easy manipulation of sets of characters.
142             Besides the typical set algebra it provides an unusual feature:
143             a D source code generator for detection of $(CODEPOINTS) in this set.
144             This is a boon for meta-programming parser frameworks,
145             and is used internally to power classification in small
146             sets like $(LREF isWhite).
147         )
148         $(LI
149             A way to construct optimal packed multi-stage tables also known as a
150             special case of $(LINK2 https://en.wikipedia.org/wiki/Trie, Trie).
151             The functions $(LREF codepointTrie), $(LREF codepointSetTrie)
152             construct custom tries that map dchar to value.
153             The end result is a fast and predictable $(BIGOH 1) lookup that powers
154             functions like $(LREF isAlpha) and $(LREF combiningClass),
155             but for user-defined data sets.
156         )
157         $(LI
158             A useful technique for Unicode-aware parsers that perform
159             character classification of encoded $(CODEPOINTS)
160             is to avoid unnecessary decoding at all costs.
161             $(LREF utfMatcher) provides an improvement over the usual workflow
162             of decode-classify-process, combining the decoding and classification
163             steps. By extracting necessary bits directly from encoded
164             $(S_LINK Code unit, code units) matchers achieve
165             significant performance improvements. See $(LREF MatcherConcept) for
166             the common interface of UTF matchers.
167         )
168         $(LI
169             Generally useful building blocks for customized normalization:
170             $(LREF combiningClass) for querying combining class
171             and $(LREF allowedIn) for testing the Quick_Check
172             property of a given normalization form.
173         )
174         $(LI
175             Access to a large selection of commonly used sets of $(CODEPOINTS).
176             $(S_LINK Unicode properties, Supported sets) include Script,
177             Block and General Category. The exact contents of a set can be
178             observed in the CLDR utility, on the
179             $(HTTP www.unicode.org/cldr/utility/properties.jsp, property index) page
180             of the Unicode website.
181             See $(LREF unicode) for easy and (optionally) compile-time checked set
182             queries.
183         )
184     )
185     $(SECTION Synopsis)
186     ---
187     import std.uni;
188     void main()
189     {
190         // initialize code point sets using script/block or property name
191         // now 'set' contains code points from both scripts.
192         auto set = unicode("Cyrillic") | unicode("Armenian");
193         // same thing but simpler and checked at compile-time
194         auto ascii = unicode.ASCII;
195         auto currency = unicode.Currency_Symbol;
196 
197         // easy set ops
198         auto a = set & ascii;
199         assert(a.empty); // as it has no intersection with ascii
200         a = set | ascii;
201         auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian
202 
203         // some properties of code point sets
204         assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
205         // testing presence of a code point in a set
206         // is just fine, it is O(logN)
207         assert(!b['$']);
208         assert(!b['\u058F']); // Armenian dram sign
209         assert(b['¥']);
210 
211         // building fast lookup tables, these guarantee O(1) complexity
212         // 1-level Trie lookup table essentially a huge bit-set ~262Kb
213         auto oneTrie = toTrie!1(b);
214         // 2-level far more compact but typically slightly slower
215         auto twoTrie = toTrie!2(b);
216         // 3-level even smaller, and a bit slower yet
217         auto threeTrie = toTrie!3(b);
218         assert(oneTrie['£']);
219         assert(twoTrie['£']);
220         assert(threeTrie['£']);
221 
222         // build the trie with the most sensible trie level
223         // and bind it as a functor
224         auto cyrillicOrArmenian = toDelegate(set);
225         auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
226         assert(balance == "ընկեր!");
227         // compatible with bool delegate(dchar)
228         bool delegate(dchar) bindIt = cyrillicOrArmenian;
229 
230         // Normalization
231         string s = "Plain ascii (and not only), is always normalized!";
232         assert(s is normalize(s));// is the same string
233 
234         string nonS = "A\u0308ffin"; // 'A' followed by a combining diaeresis (U+0308)
235         auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
236         assert(nS == "Äffin");
237         assert(nS != nonS);
238         string composed = "Äffin";
239 
240         assert(normalize!NFD(composed) == "A\u0308ffin");
241         // to NFKD, compatibility decomposition useful for fuzzy matching/searching
242         assert(normalize!NFKD("2¹⁰") == "210");
243     }
244     ---
245     $(SECTION Terminology)
246     $(P The following is a list of important Unicode notions
247     and definitions. Any conventions used specifically in this
248     module alone are marked as such. The descriptions are based on the formal
249     definition as found in $(HTTP www.unicode.org/versions/Unicode6.2.0/ch03.pdf,
250     chapter three of The Unicode Standard Core Specification.)
251     )
252     $(P $(DEF Abstract character) A unit of information used for the organization,
253         control, or representation of textual data.
254         Note that:
255         $(UL
256             $(LI When representing data, the nature of that data
257                 is generally symbolic as opposed to some other
258                 kind of data (for example, visual).
259             )
260              $(LI An abstract character has no concrete form
261                 and should not be confused with a $(S_LINK Glyph, glyph).
262             )
263             $(LI An abstract character does not necessarily
264                 correspond to what a user thinks of as a “character”
265                 and should not be confused with a $(LREF Grapheme).
266             )
267             $(LI The abstract characters encoded (see Encoded character)
268                 are known as Unicode abstract characters.
269             )
270             $(LI Abstract characters not directly
271                 encoded by the Unicode Standard can often be
272                 represented by the use of combining character sequences.
273             )
274         )
275     )
276     $(P $(DEF Canonical decomposition)
277         The decomposition of a character or character sequence
278         that results from recursively applying the canonical
279         mappings found in the Unicode Character Database
280         and those described in Conjoining Jamo Behavior
281         (section 12 of
282         $(HTTP www.unicode.org/uni2book/ch03.pdf, Unicode Conformance)).
283     )
284     $(P $(DEF Canonical composition)
285         The precise definition of the Canonical composition
286         is the algorithm as specified in $(HTTP www.unicode.org/uni2book/ch03.pdf,
287         Unicode Conformance) section 11.
288         Informally it's the process that does the reverse of the canonical
289         decomposition with the addition of certain rules
290         that e.g. prevent legacy characters from appearing in the composed result.
291     )
292     $(P $(DEF Canonical equivalent)
293         Two character sequences are said to be canonical equivalents if
294         their full canonical decompositions are identical.
295     )
296     $(P $(DEF Character) Typically differs by context.
297         For the purpose of this documentation the term $(I character)
298         implies $(I encoded character), that is, a code point having
299         an assigned abstract character (a symbolic meaning).
300     )
301     $(P $(DEF Code point) Any value in the Unicode codespace;
302         that is, the range of integers from 0 to 10FFFF (hex).
303         Not all code points are assigned to encoded characters.
304     )
305     $(P $(DEF Code unit) The minimal bit combination that can represent
306         a unit of encoded text for processing or interchange.
307         Depending on the encoding this could be:
308         8-bit code units in the UTF-8 (`char`),
309         16-bit code units in the UTF-16 (`wchar`),
310         and 32-bit code units in the UTF-32 (`dchar`).
311         $(I Note that in UTF-32, a code unit is a code point
312         and is represented by the D `dchar` type.)
313     )
314     $(P $(DEF Combining character) A character with the General Category
315         of Combining Mark(M).
316         $(UL
317             $(LI All characters with non-zero canonical combining class
318             are combining characters, but the reverse is not the case:
319             there are combining characters with a zero combining class.
320             )
321             $(LI These characters are not normally used in isolation
322             unless they are being described. They include such characters
323             as accents, diacritics, Hebrew points, Arabic vowel signs,
324             and Indic matras.
325             )
326         )
327     )
328     $(P $(DEF Combining class)
329         A numerical value used by the Unicode Canonical Ordering Algorithm
330         to determine which sequences of combining marks are to be
331         considered canonically equivalent and  which are not.
332     )
333     $(P $(DEF Compatibility decomposition)
334         The decomposition of a character or character sequence that results
335         from recursively applying both the compatibility mappings and
336         the canonical mappings found in the Unicode Character Database, and those
337         described in Conjoining Jamo Behavior, until no characters
338         can be further decomposed.
339     )
340     $(P $(DEF Compatibility equivalent)
341         Two character sequences are said to be compatibility
342         equivalents if their full compatibility decompositions are identical.
343     )
344     $(P $(DEF Encoded character) An association (or mapping)
345         between an abstract character and a code point.
346     )
347     $(P $(DEF Glyph) The actual, concrete image of a glyph representation
348         having been rasterized or otherwise imaged onto some display surface.
349     )
350     $(P $(DEF Grapheme base) A character with the property
351         Grapheme_Base, or any standard Korean syllable block.
352     )
353     $(P $(DEF Grapheme cluster) Defined as the text between
354         grapheme boundaries  as specified by Unicode Standard Annex #29,
355         $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation).
356         Important general properties of a grapheme:
357         $(UL
358             $(LI The grapheme cluster represents a horizontally segmentable
359             unit of text, consisting of some grapheme base (which may
360             consist of a Korean syllable) together with any number of
361             nonspacing marks applied to it.
362             )
363             $(LI  A grapheme cluster typically starts with a grapheme base
364             and then extends across any subsequent sequence of nonspacing marks.
365             A grapheme cluster is most directly relevant to text rendering and
366             processes such as cursor placement and text selection in editing,
367             but may also be relevant to comparison and searching.
368             )
369             $(LI For many processes, a grapheme cluster behaves as if it was a
370             single character with the same properties as its grapheme base.
371             Effectively, nonspacing marks apply $(I graphically) to the base,
372             but do not change its properties.
373             )
374         )
375         $(P This module defines a number of primitives that work with graphemes:
376         $(LREF Grapheme), $(LREF decodeGrapheme) and $(LREF graphemeStride).
377         All of them are using $(I extended grapheme) boundaries
378         as defined in the aforementioned standard annex.
379         )
380     )
381     $(P $(DEF Nonspacing mark) A combining character with the
382         General Category of Nonspacing Mark (Mn) or Enclosing Mark (Me).
383     )
384     $(P $(DEF Spacing mark) A combining character that is not a nonspacing mark.
385     )
386     $(SECTION Normalization)
387     $(P The concepts of $(S_LINK Canonical equivalent, canonical equivalent)
388         or $(S_LINK Compatibility equivalent, compatibility equivalent)
389         characters in the Unicode Standard make it necessary to have a full, formal
390         definition of equivalence for Unicode strings.
391         String equivalence is determined by a process called normalization,
392         whereby strings are converted into forms which are compared
393         directly for identity. This is the primary goal of the normalization process,
394         see the function $(LREF normalize) to convert into any of
395         the four defined forms.
396     )
397     $(P A very important attribute of the Unicode Normalization Forms
398         is that they must remain stable between versions of the Unicode Standard.
399         A Unicode string normalized to a particular Unicode Normalization Form
400         in one version of the standard is guaranteed to remain in that Normalization
401         Form for implementations of future versions of the standard.
402     )
403     $(P The Unicode Standard specifies four normalization forms.
404         Informally, two of these forms are defined by maximal decomposition
405         of equivalent sequences, and two of these forms are defined
406         by maximal $(I composition) of equivalent sequences.
407             $(UL
408             $(LI Normalization Form D (NFD): The $(S_LINK Canonical decomposition,
409                 canonical decomposition) of a character sequence.)
410             $(LI Normalization Form KD (NFKD): The $(S_LINK Compatibility decomposition,
411                 compatibility decomposition) of a character sequence.)
412             $(LI Normalization Form C (NFC): The canonical composition of the
413                 $(S_LINK Canonical decomposition, canonical decomposition)
414                 of a coded character sequence.)
415             $(LI Normalization Form KC (NFKC): The canonical composition
416             of the $(S_LINK Compatibility decomposition,
417                 compatibility decomposition) of a character sequence.)
418             )
419     )
420     $(P The choice of the normalization form depends on the particular use case.
421         NFC is the best form for general text, since it's more compatible with
422         strings converted from legacy encodings. NFKC is the preferred form for
423         identifiers, especially where there are security concerns. NFD and NFKD
424         are the most useful for internal processing.
425     )
426     $(SECTION Construction of lookup tables)
427     $(P The Unicode standard describes a set of algorithms that
428         depend on having the ability to quickly look up various properties
429         of a code point. Given the codespace of about 1 million $(CODEPOINTS),
430         it is not a trivial task to provide a space-efficient solution for
431         the multitude of properties.
432     )
433     $(P Common approaches such as hash-tables or binary search over
434         sorted code point intervals (as in $(LREF InversionList)) are insufficient.
435         Hash-tables have enormous memory footprint and binary search
436         over intervals is not fast enough for some heavy-duty algorithms.
437     )
438     $(P The recommended solution (see Unicode Implementation Guidelines)
439         is using multi-stage tables that are an implementation of the
440         $(HTTP en.wikipedia.org/wiki/Trie, Trie) data structure with integer
441         keys and a fixed number of stages. For the remainder of the section
442         this will be called a fixed trie. The following describes a particular
443         implementation that is aimed for the speed of access at the expense
444         of ideal size savings.
445     )
446     $(P Taking a 2-level Trie as an example the principle of operation is as follows.
447         Split the number of bits in a key (code point, 21 bits) into 2 components
448         (e.g. 13 and 8).  The first is the number of bits in the index of the trie
449          and the other is the number of bits in each page of the trie.
450         The layout of the trie is then an array of size 2^^bits-of-index followed
451         by an array of memory chunks of size 2^^bits-of-page/bits-per-element.
452     )
453     $(P The number of pages is variable (but not less than 1)
454         unlike the number of entries in the index. The slots of the index
455         all have to contain a number of a page that is present. The lookup is then
456         just a couple of operations - slice the upper bits,
457         lookup an index for these, take a page at this index and use
458         the lower bits as an offset within this page.
459 
460         Assuming that pages are laid out consecutively
461         in one array at `pages`, the pseudo-code is:
462     )
463     ---
464     auto elemsPerPage = (2 ^^ bits_per_page) / Value.sizeOfInBits;
465     pages[index[n >> bits_per_page]][n & (elemsPerPage - 1)];
466     ---
467     $(P Where if `elemsPerPage` is a power of 2 the whole process is
468         a handful of simple instructions and 2 array reads. Subsequent levels
469         of the trie are introduced by recursing on this notion - the index array
470         is treated as values. The number of bits in index is then again
471         split into 2 parts, with pages over 'current-index' and the new 'upper-index'.
472     )
473 
474     $(P For completeness a level 1 trie is simply an array.
475         The current implementation takes advantage of bit-packing values
476         when the range is known to be limited in advance (such as `bool`).
477         See also $(LREF BitPacked) for enforcing it manually.
478         The major size advantage however comes from the fact
479         that multiple $(B identical pages on every level are merged) by construction.
480     )
481     $(P The process of constructing a trie is more involved and is hidden from
482         the user in the form of the convenience functions $(LREF codepointTrie),
483         $(LREF codepointSetTrie) and the even more convenient $(LREF toTrie).
484         In general a set or built-in AA with `dchar` type
485         can be turned into a trie. The trie object in this module
486         is read-only (immutable); it's effectively frozen after construction.
487     )
488     $(SECTION Unicode properties)
489     $(P This is a full list of Unicode properties accessible through $(LREF unicode)
490         with specific helpers per category nested within. Consult the
491         $(HTTP www.unicode.org/cldr/utility/properties.jsp, CLDR utility)
492         when in doubt about the contents of a particular set.
493     )
494     $(P General category sets listed below are only accessible with the
495         $(LREF unicode) shorthand accessor.)
496         $(BOOKTABLE $(B General category ),
497              $(TR $(TH Abb.) $(TH Long form)
498                 $(TH Abb.) $(TH Long form)$(TH Abb.) $(TH Long form))
499             $(TR $(TD L) $(TD Letter)
500                 $(TD Cn) $(TD Unassigned)  $(TD Po) $(TD Other_Punctuation))
501             $(TR $(TD Ll) $(TD Lowercase_Letter)
502                 $(TD Co) $(TD Private_Use) $(TD Ps) $(TD Open_Punctuation))
503             $(TR $(TD Lm) $(TD Modifier_Letter)
504                 $(TD Cs) $(TD Surrogate)   $(TD S) $(TD Symbol))
505             $(TR $(TD Lo) $(TD Other_Letter)
506                 $(TD N) $(TD Number)  $(TD Sc) $(TD Currency_Symbol))
507             $(TR $(TD Lt) $(TD Titlecase_Letter)
508               $(TD Nd) $(TD Decimal_Number)  $(TD Sk) $(TD Modifier_Symbol))
509             $(TR $(TD Lu) $(TD Uppercase_Letter)
510               $(TD Nl) $(TD Letter_Number)   $(TD Sm) $(TD Math_Symbol))
511             $(TR $(TD M) $(TD Mark)
512               $(TD No) $(TD Other_Number)    $(TD So) $(TD Other_Symbol))
513             $(TR $(TD Mc) $(TD Spacing_Mark)
514               $(TD P) $(TD Punctuation) $(TD Z) $(TD Separator))
515             $(TR $(TD Me) $(TD Enclosing_Mark)
516               $(TD Pc) $(TD Connector_Punctuation)   $(TD Zl) $(TD Line_Separator))
517             $(TR $(TD Mn) $(TD Nonspacing_Mark)
518               $(TD Pd) $(TD Dash_Punctuation)    $(TD Zp) $(TD Paragraph_Separator))
519             $(TR $(TD C) $(TD Other)
520               $(TD Pe) $(TD Close_Punctuation) $(TD Zs) $(TD Space_Separator))
521             $(TR $(TD Cc) $(TD Control) $(TD Pf)
522               $(TD Final_Punctuation)   $(TD -) $(TD Any))
523             $(TR $(TD Cf) $(TD Format)
524               $(TD Pi) $(TD Initial_Punctuation) $(TD -) $(TD ASCII))
525     )
526     $(P Sets for other commonly useful properties that are
527         accessible with $(LREF unicode):)
528         $(BOOKTABLE $(B Common binary properties),
529             $(TR $(TH Name) $(TH Name) $(TH Name))
530             $(TR $(TD Alphabetic)  $(TD Ideographic) $(TD Other_Uppercase))
531             $(TR $(TD ASCII_Hex_Digit) $(TD IDS_Binary_Operator) $(TD Pattern_Syntax))
532             $(TR $(TD Bidi_Control)    $(TD ID_Start)    $(TD Pattern_White_Space))
533             $(TR $(TD Cased)   $(TD IDS_Trinary_Operator)    $(TD Quotation_Mark))
534             $(TR $(TD Case_Ignorable)  $(TD Join_Control)    $(TD Radical))
535             $(TR $(TD Dash)    $(TD Logical_Order_Exception) $(TD Soft_Dotted))
536             $(TR $(TD Default_Ignorable_Code_Point)    $(TD Lowercase)   $(TD STerm))
537             $(TR $(TD Deprecated)  $(TD Math)    $(TD Terminal_Punctuation))
538             $(TR $(TD Diacritic)   $(TD Noncharacter_Code_Point) $(TD Unified_Ideograph))
539             $(TR $(TD Extender)    $(TD Other_Alphabetic)    $(TD Uppercase))
540             $(TR $(TD Grapheme_Base)   $(TD Other_Default_Ignorable_Code_Point)  $(TD Variation_Selector))
541             $(TR $(TD Grapheme_Extend) $(TD Other_Grapheme_Extend)   $(TD White_Space))
542             $(TR $(TD Grapheme_Link)   $(TD Other_ID_Continue)   $(TD XID_Continue))
543             $(TR $(TD Hex_Digit)   $(TD Other_ID_Start)  $(TD XID_Start))
544             $(TR $(TD Hyphen)  $(TD Other_Lowercase) )
545             $(TR $(TD ID_Continue) $(TD Other_Math)  )
546     )
547     $(P Below is the table with block names accepted by $(LREF unicode.block).
548         Note that the shorthand version $(LREF unicode) requires "In"
549         to be prepended to the names of blocks so as to disambiguate
550         scripts and blocks.
551     )
552     $(BOOKTABLE $(B Blocks),
553         $(TR $(TD Aegean Numbers)    $(TD Ethiopic Extended) $(TD Mongolian))
554         $(TR $(TD Alchemical Symbols)    $(TD Ethiopic Extended-A)   $(TD Musical Symbols))
555         $(TR $(TD Alphabetic Presentation Forms) $(TD Ethiopic Supplement)   $(TD Myanmar))
556         $(TR $(TD Ancient Greek Musical Notation)    $(TD General Punctuation)   $(TD Myanmar Extended-A))
557         $(TR $(TD Ancient Greek Numbers) $(TD Geometric Shapes)  $(TD New Tai Lue))
558         $(TR $(TD Ancient Symbols)   $(TD Georgian)  $(TD NKo))
559         $(TR $(TD Arabic)    $(TD Georgian Supplement)   $(TD Number Forms))
560         $(TR $(TD Arabic Extended-A) $(TD Glagolitic)    $(TD Ogham))
561         $(TR $(TD Arabic Mathematical Alphabetic Symbols)    $(TD Gothic)    $(TD Ol Chiki))
562         $(TR $(TD Arabic Presentation Forms-A)   $(TD Greek and Coptic)  $(TD Old Italic))
563         $(TR $(TD Arabic Presentation Forms-B)   $(TD Greek Extended)    $(TD Old Persian))
564         $(TR $(TD Arabic Supplement) $(TD Gujarati)  $(TD Old South Arabian))
565         $(TR $(TD Armenian)  $(TD Gurmukhi)  $(TD Old Turkic))
566         $(TR $(TD Arrows)    $(TD Halfwidth and Fullwidth Forms) $(TD Optical Character Recognition))
567         $(TR $(TD Avestan)   $(TD Hangul Compatibility Jamo) $(TD Oriya))
568         $(TR $(TD Balinese)  $(TD Hangul Jamo)   $(TD Osmanya))
569         $(TR $(TD Bamum) $(TD Hangul Jamo Extended-A)    $(TD Phags-pa))
570         $(TR $(TD Bamum Supplement)  $(TD Hangul Jamo Extended-B)    $(TD Phaistos Disc))
571         $(TR $(TD Basic Latin)   $(TD Hangul Syllables)  $(TD Phoenician))
572         $(TR $(TD Batak) $(TD Hanunoo)   $(TD Phonetic Extensions))
573         $(TR $(TD Bengali)   $(TD Hebrew)    $(TD Phonetic Extensions Supplement))
574         $(TR $(TD Block Elements)    $(TD High Private Use Surrogates)   $(TD Playing Cards))
575         $(TR $(TD Bopomofo)  $(TD High Surrogates)   $(TD Private Use Area))
576         $(TR $(TD Bopomofo Extended) $(TD Hiragana)  $(TD Rejang))
577         $(TR $(TD Box Drawing)   $(TD Ideographic Description Characters)    $(TD Rumi Numeral Symbols))
578         $(TR $(TD Brahmi)    $(TD Imperial Aramaic)  $(TD Runic))
579         $(TR $(TD Braille Patterns)  $(TD Inscriptional Pahlavi) $(TD Samaritan))
580         $(TR $(TD Buginese)  $(TD Inscriptional Parthian)    $(TD Saurashtra))
581         $(TR $(TD Buhid) $(TD IPA Extensions)    $(TD Sharada))
582         $(TR $(TD Byzantine Musical Symbols) $(TD Javanese)  $(TD Shavian))
583         $(TR $(TD Carian)    $(TD Kaithi)    $(TD Sinhala))
584         $(TR $(TD Chakma)    $(TD Kana Supplement)   $(TD Small Form Variants))
585         $(TR $(TD Cham)  $(TD Kanbun)    $(TD Sora Sompeng))
586         $(TR $(TD Cherokee)  $(TD Kangxi Radicals)   $(TD Spacing Modifier Letters))
587         $(TR $(TD CJK Compatibility) $(TD Kannada)   $(TD Specials))
588         $(TR $(TD CJK Compatibility Forms)   $(TD Katakana)  $(TD Sundanese))
589         $(TR $(TD CJK Compatibility Ideographs)  $(TD Katakana Phonetic Extensions)  $(TD Sundanese Supplement))
590         $(TR $(TD CJK Compatibility Ideographs Supplement)   $(TD Kayah Li)  $(TD Superscripts and Subscripts))
591         $(TR $(TD CJK Radicals Supplement)   $(TD Kharoshthi)    $(TD Supplemental Arrows-A))
592         $(TR $(TD CJK Strokes)   $(TD Khmer) $(TD Supplemental Arrows-B))
593         $(TR $(TD CJK Symbols and Punctuation)   $(TD Khmer Symbols) $(TD Supplemental Mathematical Operators))
594         $(TR $(TD CJK Unified Ideographs)    $(TD Lao)   $(TD Supplemental Punctuation))
595         $(TR $(TD CJK Unified Ideographs Extension A)    $(TD Latin-1 Supplement)    $(TD Supplementary Private Use Area-A))
596         $(TR $(TD CJK Unified Ideographs Extension B)    $(TD Latin Extended-A)  $(TD Supplementary Private Use Area-B))
597         $(TR $(TD CJK Unified Ideographs Extension C)    $(TD Latin Extended Additional) $(TD Syloti Nagri))
598         $(TR $(TD CJK Unified Ideographs Extension D)    $(TD Latin Extended-B)  $(TD Syriac))
599         $(TR $(TD Combining Diacritical Marks)   $(TD Latin Extended-C)  $(TD Tagalog))
600         $(TR $(TD Combining Diacritical Marks for Symbols)   $(TD Latin Extended-D)  $(TD Tagbanwa))
601         $(TR $(TD Combining Diacritical Marks Supplement)    $(TD Lepcha)    $(TD Tags))
602         $(TR $(TD Combining Half Marks)  $(TD Letterlike Symbols)    $(TD Tai Le))
603         $(TR $(TD Common Indic Number Forms) $(TD Limbu) $(TD Tai Tham))
604         $(TR $(TD Control Pictures)  $(TD Linear B Ideograms)    $(TD Tai Viet))
605         $(TR $(TD Coptic)    $(TD Linear B Syllabary)    $(TD Tai Xuan Jing Symbols))
606         $(TR $(TD Counting Rod Numerals) $(TD Lisu)  $(TD Takri))
607         $(TR $(TD Cuneiform) $(TD Low Surrogates)    $(TD Tamil))
608         $(TR $(TD Cuneiform Numbers and Punctuation) $(TD Lycian)    $(TD Telugu))
609         $(TR $(TD Currency Symbols)  $(TD Lydian)    $(TD Thaana))
610         $(TR $(TD Cypriot Syllabary) $(TD Mahjong Tiles) $(TD Thai))
611         $(TR $(TD Cyrillic)  $(TD Malayalam) $(TD Tibetan))
612         $(TR $(TD Cyrillic Extended-A)   $(TD Mandaic)   $(TD Tifinagh))
613         $(TR $(TD Cyrillic Extended-B)   $(TD Mathematical Alphanumeric Symbols) $(TD Transport And Map Symbols))
614         $(TR $(TD Cyrillic Supplement)   $(TD Mathematical Operators)    $(TD Ugaritic))
615         $(TR $(TD Deseret)   $(TD Meetei Mayek)  $(TD Unified Canadian Aboriginal Syllabics))
616         $(TR $(TD Devanagari)    $(TD Meetei Mayek Extensions)   $(TD Unified Canadian Aboriginal Syllabics Extended))
617         $(TR $(TD Devanagari Extended)   $(TD Meroitic Cursive)  $(TD Vai))
618         $(TR $(TD Dingbats)  $(TD Meroitic Hieroglyphs)  $(TD Variation Selectors))
619         $(TR $(TD Domino Tiles)  $(TD Miao)  $(TD Variation Selectors Supplement))
620         $(TR $(TD Egyptian Hieroglyphs)  $(TD Miscellaneous Mathematical Symbols-A)  $(TD Vedic Extensions))
621         $(TR $(TD Emoticons) $(TD Miscellaneous Mathematical Symbols-B)  $(TD Vertical Forms))
622         $(TR $(TD Enclosed Alphanumerics)    $(TD Miscellaneous Symbols) $(TD Yijing Hexagram Symbols))
623         $(TR $(TD Enclosed Alphanumeric Supplement)  $(TD Miscellaneous Symbols and Arrows)  $(TD Yi Radicals))
624         $(TR $(TD Enclosed CJK Letters and Months)   $(TD Miscellaneous Symbols And Pictographs) $(TD Yi Syllables))
625         $(TR $(TD Enclosed Ideographic Supplement)   $(TD Miscellaneous Technical)   )
626         $(TR $(TD Ethiopic)  $(TD Modifier Tone Letters) )
627     )
628     $(P Below is the table with script names accepted by $(LREF unicode.script)
629         and by the shorthand version $(LREF unicode):)
630         $(BOOKTABLE $(B Scripts),
631             $(TR $(TD Arabic)  $(TD Hanunoo) $(TD Old_Italic))
632             $(TR $(TD Armenian)    $(TD Hebrew)  $(TD Old_Persian))
633             $(TR $(TD Avestan) $(TD Hiragana)    $(TD Old_South_Arabian))
634             $(TR $(TD Balinese)    $(TD Imperial_Aramaic)    $(TD Old_Turkic))
635             $(TR $(TD Bamum)   $(TD Inherited)   $(TD Oriya))
636             $(TR $(TD Batak)   $(TD Inscriptional_Pahlavi)   $(TD Osmanya))
637             $(TR $(TD Bengali) $(TD Inscriptional_Parthian)  $(TD Phags_Pa))
638             $(TR $(TD Bopomofo)    $(TD Javanese)    $(TD Phoenician))
639             $(TR $(TD Brahmi)  $(TD Kaithi)  $(TD Rejang))
640             $(TR $(TD Braille) $(TD Kannada) $(TD Runic))
641             $(TR $(TD Buginese)    $(TD Katakana)    $(TD Samaritan))
642             $(TR $(TD Buhid)   $(TD Kayah_Li)    $(TD Saurashtra))
643             $(TR $(TD Canadian_Aboriginal) $(TD Kharoshthi)  $(TD Sharada))
644             $(TR $(TD Carian)  $(TD Khmer)   $(TD Shavian))
645             $(TR $(TD Chakma)  $(TD Lao) $(TD Sinhala))
646             $(TR $(TD Cham)    $(TD Latin)   $(TD Sora_Sompeng))
647             $(TR $(TD Cherokee)    $(TD Lepcha)  $(TD Sundanese))
648             $(TR $(TD Common)  $(TD Limbu)   $(TD Syloti_Nagri))
649             $(TR $(TD Coptic)  $(TD Linear_B)    $(TD Syriac))
650             $(TR $(TD Cuneiform)   $(TD Lisu)    $(TD Tagalog))
651             $(TR $(TD Cypriot) $(TD Lycian)  $(TD Tagbanwa))
652             $(TR $(TD Cyrillic)    $(TD Lydian)  $(TD Tai_Le))
653             $(TR $(TD Deseret) $(TD Malayalam)   $(TD Tai_Tham))
654             $(TR $(TD Devanagari)  $(TD Mandaic) $(TD Tai_Viet))
655             $(TR $(TD Egyptian_Hieroglyphs)    $(TD Meetei_Mayek)    $(TD Takri))
656             $(TR $(TD Ethiopic)    $(TD Meroitic_Cursive)    $(TD Tamil))
657             $(TR $(TD Georgian)    $(TD Meroitic_Hieroglyphs)    $(TD Telugu))
658             $(TR $(TD Glagolitic)  $(TD Miao)    $(TD Thaana))
659             $(TR $(TD Gothic)  $(TD Mongolian)   $(TD Thai))
660             $(TR $(TD Greek)   $(TD Myanmar) $(TD Tibetan))
661             $(TR $(TD Gujarati)    $(TD New_Tai_Lue) $(TD Tifinagh))
662             $(TR $(TD Gurmukhi)    $(TD Nko) $(TD Ugaritic))
663             $(TR $(TD Han) $(TD Ogham)   $(TD Vai))
664             $(TR $(TD Hangul)  $(TD Ol_Chiki)    $(TD Yi))
665     )
666     $(P Below is the table of names accepted by $(LREF unicode.hangulSyllableType).)
667         $(BOOKTABLE $(B Hangul syllable type),
668             $(TR $(TH Abb.) $(TH Long form))
669             $(TR $(TD L)   $(TD Leading_Jamo))
670             $(TR $(TD LV)  $(TD LV_Syllable))
671             $(TR $(TD LVT) $(TD LVT_Syllable) )
672             $(TR $(TD T)   $(TD Trailing_Jamo))
673             $(TR $(TD V)   $(TD Vowel_Jamo))
674     )
675     References:
676         $(HTTP www.digitalmars.com/d/ascii-table.html, ASCII Table),
677         $(HTTP en.wikipedia.org/wiki/Unicode, Wikipedia),
678         $(HTTP www.unicode.org, The Unicode Consortium),
679         $(HTTP www.unicode.org/reports/tr15/, Unicode normalization forms),
        $(HTTP www.unicode.org/reports/tr29/, Unicode text segmentation),
        $(HTTP www.unicode.org/uni2book/ch05.pdf,
            Unicode Implementation Guidelines),
        $(HTTP www.unicode.org/uni2book/ch03.pdf,
            Unicode Conformance)
685     Trademarks:
686         Unicode(tm) is a trademark of Unicode, Inc.
687 
688     Copyright: Copyright 2013 -
689     License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
690     Authors:   Dmitry Olshansky
691     Source:    $(PHOBOSSRC std/uni/package.d)
692     Standards: $(HTTP www.unicode.org/versions/Unicode6.2.0/, Unicode v6.2)
693 
694 Macros:
695 
696 SECTION = <h3><a id="$1">$0</a></h3>
697 DEF = <div><a id="$1"><i>$0</i></a></div>
698 S_LINK = <a href="#$1">$+</a>
699 CODEPOINT = $(S_LINK Code point, code point)
700 CODEPOINTS = $(S_LINK Code point, code points)
701 CHARACTER = $(S_LINK Character, character)
702 CHARACTERS = $(S_LINK Character, characters)
703 CLUSTER = $(S_LINK Grapheme cluster, grapheme cluster)
704 +/
705 module std.uni;
706 
707 import std.meta : AliasSeq;
708 import std.range.primitives : back, ElementEncodingType, ElementType, empty,
709     front, hasLength, hasSlicing, isForwardRange, isInputRange,
710     isRandomAccessRange, popFront, put, save;
711 import std.traits : isConvertibleToString, isIntegral, isSomeChar,
712     isSomeString, Unqual, isDynamicArray;
713 // debug = std_uni;
714 
715 import std.internal.unicode_tables; // generated file
716 
717 debug(std_uni) import std.stdio; // writefln, writeln
718 
719 private:
720 
721 
// Copies `src` into `dest` element by element, starting at the highest index
// and walking down to index 0. Both slices must have the same length.
void copyBackwards(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach_reverse (idx; 0 .. src.length)
        dest[idx] = src[idx];
}
728 
// Copies `src` into `dest` element by element, starting at index 0 and
// walking up to the last index. Both slices must have the same length.
void copyForward(T,U)(T[] src, U[] dest)
{
    assert(src.length == dest.length);
    foreach (idx; 0 .. src.length)
        dest[idx] = src[idx];
}
735 
// TODO: update to reflect all major CPUs supporting unaligned reads
// Compile-time flag: true for the X86, X86_64 and SystemZ versions,
// false for every other target.
version (X86)
    enum hasUnalignedReads = true;
else version (X86_64)
    enum hasUnalignedReads = true;
else version (SystemZ)
    enum hasUnalignedReads = true;
else
    enum hasUnalignedReads = false; // better be safe than sorry
745 
// Public separator code point constants (explicitly `public` despite the
// surrounding `private:` section).
public enum dchar lineSep = '\u2028'; /// Constant $(CODEPOINT) (0x2028) - line separator.
public enum dchar paraSep = '\u2029'; /// Constant $(CODEPOINT) (0x2029) - paragraph separator.
public enum dchar nelSep  = '\u0085'; /// Constant $(CODEPOINT) (0x0085) - next line.
749 
// test the intro example
// (exercises set construction, set algebra, trie lookup tables, toDelegate
// and normalization in the same order as the module documentation)
@safe unittest
{
    import std.algorithm.searching : find;
    // initialize code point sets using script/block or property name
    // set contains code points from both scripts.
    auto set = unicode("Cyrillic") | unicode("Armenian");
    // or simpler and statically-checked look
    auto ascii = unicode.ASCII;
    auto currency = unicode.Currency_Symbol;

    // easy set ops
    auto a = set & ascii;
    assert(a.empty); // as it has no intersection with ascii
    a = set | ascii;
    auto b = currency - a; // subtract all ASCII, Cyrillic and Armenian

    // some properties of code point sets
    assert(b.length > 45); // 46 items in Unicode 6.1, even more in 6.2
    // testing presence of a code point in a set
    // is just fine, it is O(logN)
    assert(!b['$']);
    assert(!b['\u058F']); // Armenian dram sign
    assert(b['¥']);

    // building fast lookup tables, these guarantee O(1) complexity
    // 1-level Trie lookup table essentially a huge bit-set ~262Kb
    auto oneTrie = toTrie!1(b);
    // 2-level far more compact but typically slightly slower
    auto twoTrie = toTrie!2(b);
    // 3-level even smaller, and a bit slower yet
    auto threeTrie = toTrie!3(b);
    assert(oneTrie['£']);
    assert(twoTrie['£']);
    assert(threeTrie['£']);

    // build the trie with the most sensible trie level
    // and bind it as a functor
    auto cyrillicOrArmenian = toDelegate(set);
    auto balance = find!(cyrillicOrArmenian)("Hello ընկեր!");
    assert(balance == "ընկեր!");
    // compatible with bool delegate(dchar)
    bool delegate(dchar) bindIt = cyrillicOrArmenian;

    // Normalization
    string s = "Plain ascii (and not only), is always normalized!";
    assert(s is normalize(s));// is the same string

    // 'A' followed by U+0308 (a separate combining code point), i.e. the
    // decomposed spelling of the precomposed string used below
    string nonS = "A\u0308ffin";
    auto nS = normalize(nonS); // to NFC, the W3C endorsed standard
    assert(nS == "Äffin");
    assert(nS != nonS);
    string composed = "Äffin";

    assert(normalize!NFD(composed) == "A\u0308ffin");
    // to NFKD, compatibility decomposition useful for fuzzy matching/searching
    assert(normalize!NFKD("2¹⁰") == "210");
}
808 
enum lastDchar = 0x10FFFF; // U+10FFFF; presumably the highest code point handled here (going by the name)
810 
/*
    Converts `from` to the integral type `T` with a cast, asserting first that
    the value is representable in `T`.

    The range check is sign-aware. Comparing a signed value against an
    unsigned bound (or the other way round) directly makes the compiler
    convert the signed operand to unsigned, which would let a negative `from`
    pass for `uint`/`ulong` targets and would reject every small unsigned
    value for signed targets (because `T.min` turns into a huge number).

    Params:
        from = value to convert; its type `F` differs from `T`
    Returns: `cast(T) from`
*/
auto force(T, F)(F from)
if (isIntegral!T && !is(T == F))
{
    import std.traits : isSigned;
    static if (isIntegral!F && isSigned!F && !isSigned!T)
    {
        // signed -> unsigned: rule out negatives before the (then safe)
        // comparison against the unsigned upper bound
        assert(from >= 0 && from <= T.max);
    }
    else static if (isIntegral!F && !isSigned!F && isSigned!T)
    {
        // unsigned -> signed: only the upper bound is meaningful; T.max is
        // non-negative so converting it to unsigned keeps its value
        assert(from <= T.max);
    }
    else
    {
        // same signedness (or a non-integral source): plain range check
        assert(from <= T.max && from >= T.min);
    }
    return cast(T) from;
}
817 
// Overload for BitPacked targets: asserts that `from` does not exceed the
// largest value expressible in bitSizeOf!T bits, then wraps it in a `T`.
auto force(T, F)(F from)
if (isBitPacked!T && !is(T == F))
{
    alias Raw = TypeOfBitPacked!T;
    assert(from <= 2^^bitSizeOf!T-1);
    return T(cast(Raw) from);
}
824 
// Identity overload: the source type already is the target type, so the
// value is returned unchanged and no range check is needed.
auto force(T, F)(F from)
if (is(T == F))
{
    return from;
}
830 
// Repeats the bit-pattern in `val` `times` times, assuming its length is
// `bits`: copy k of the pattern occupies bits [k*bits, (k+1)*bits) of the result.
//
// Resolved entirely through `static if`:
//  - one copy: `val` itself;
//  - 1-bit pattern: all `times` low bits set (or zero), with the full-word
//    case special-cased because shifting by the word width is not allowed;
//  - odd count: replicate times-1 copies, shift them up, append one more;
//  - even count: double the pattern width and halve the count.
size_t replicateBits(size_t times, size_t bits)(size_t val) @safe pure nothrow @nogc
{
    static if (times == 1)
        return val;
    else static if (bits == 1)
    {
        static if (times == size_t.sizeof*8)
            return val ? size_t.max : 0;
        else
        {
            // shift a size_t, not an int literal: `1 << times` is an `int`
            // shift and is out of range once times reaches 32
            return val ? (size_t(1) << times)-1 : 0;
        }
    }
    else static if (times % 2)
        return (replicateBits!(times-1, bits)(val)<<bits) | val;
    else
        return replicateBits!(times/2, bits*2)((val << bits) | val);
}
848 
@safe pure nothrow @nogc unittest // for replicate
{
    import std.algorithm.iteration : sum, map;
    import std.range : iota;
    size_t m = 0b111;
    size_t m2 = 0b01;
    static foreach (i; AliasSeq!(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    {
        // i copies of 0b111 form a run of 3*i set bits, i.e. 2^(3*i) - 1
        assert(replicateBits!(i, 3)(m)+1 == (1<<(3*i)));
        // i copies of 0b01 set every even bit below 2*i
        assert(replicateBits!(i, 2)(m2) == iota(0, i).map!"2^^(2*a)"().sum());
    }
}
861 
// multiple arrays squashed into one memory block
//
// Level i holds sz[i] elements of type Types[i], bit-packed into size_t
// words, and starts offsets[i] words into `storage`.
struct MultiArray(Types...)
{
    import std.range.primitives : isOutputRange;

    // Allocates zero-initialized storage for one array per entry of Types;
    // sizes[i] is the element count of level i.
    this(size_t[] sizes...) @safe pure nothrow
    {
        assert(dim == sizes.length);
        size_t full_size;
        foreach (i, v; Types)
        {
            full_size += spaceFor!(bitSizeOf!v)(sizes[i]);
            sz[i] = sizes[i];
            // each level starts right after the words of the previous one
            static if (i >= 1)
                offsets[i] = offsets[i-1] +
                    spaceFor!(bitSizeOf!(Types[i-1]))(sizes[i-1]);
        }

        storage = new size_t[full_size];
    }

    // Wraps already laid out data (offsets, sizes and words) without copying.
    this(const(size_t)[] raw_offsets,
        const(size_t)[] raw_sizes,
        return scope const(size_t)[] data) return scope const @safe pure nothrow @nogc
    {
        offsets[] = raw_offsets[];
        sz[] = raw_sizes[];
        storage = data;
    }

    // Bounds-aware packed view over level n.
    @property auto slice(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return packedArrayView!(Types[n])(ptr, sz[n]);
    }

    // Packed pointer to the first element of level n.
    @property auto ptr(size_t n)()inout pure nothrow @nogc
    {
        auto ptr = raw_ptr!n;
        return inout(PackedPtr!(Types[n]))(ptr);
    }

    template length(size_t n)
    {
        // Number of elements in level n.
        @property size_t length()const @safe pure nothrow @nogc{ return sz[n]; }

        // Resizes level n to new_size elements, moving the data of all
        // following levels and adjusting their offsets accordingly.
        @property void length(size_t new_size)
        {
            if (new_size > sz[n])
            {// extend
                size_t delta = (new_size - sz[n]);
                sz[n] += delta;
                delta = spaceFor!(bitSizeOf!(Types[n]))(delta);
                storage.length +=  delta;// extend space at end
                // raw_slice!x must follow resize as it could be moved!
                // next stmts move all data past this array, last-one-goes-first
                static if (n != dim-1)
                {
                    auto start = raw_ptr!(n+1);
                    // len includes delta
                    size_t len = (storage.ptr+storage.length-start);

                    copyBackwards(start[0 .. len-delta], start[delta .. len]);

                    // the vacated words now belong to level n
                    start[0 .. delta] = 0;
                    // offsets are used for raw_slice, ptr etc.
                    foreach (i; n+1 .. dim)
                        offsets[i] += delta;
                }
            }
            else if (new_size < sz[n])
            {// shrink
                // Release exactly the words that stop being needed. Converting
                // the element delta on its own rounds up and could give away
                // a word that still holds live elements of this level.
                immutable delta = spaceFor!(bitSizeOf!(Types[n]))(sz[n])
                    - spaceFor!(bitSizeOf!(Types[n]))(new_size);
                sz[n] = new_size;
                // move all data past this array `delta` words towards the
                // start of storage
                static if (n != dim-1)
                {
                    if (delta != 0)
                    {
                        immutable tail = offsets[n+1];
                        // destination precedes source, so a front-to-back
                        // copy never overwrites words it still has to read
                        copyForward(storage[tail .. $],
                            storage[tail - delta .. $ - delta]);

                        // adjust offsets last, they affect raw_slice
                        foreach (i; n+1 .. dim)
                            offsets[i] -= delta;
                    }
                }
                storage.length -= delta;
            }
            // else - NOP
        }
    }

    // Size in bytes of the whole storage (default) or of the words that
    // belong to level n.
    @property size_t bytes(size_t n=size_t.max)() const @safe
    {
        static if (n == size_t.max)
            return storage.length*size_t.sizeof;
        else static if (n != Types.length-1)
            return (raw_ptr!(n+1)-raw_ptr!n)*size_t.sizeof;
        else
            return (storage.ptr+storage.length - raw_ptr!n)*size_t.sizeof;
    }

    // Writes offsets, sizes and the raw words to `sink` as three
    // comma-separated bracketed lists of hexadecimal literals.
    void store(OutRange)(scope OutRange sink) const
        if (isOutputRange!(OutRange, char))
    {
        import std.format.write : formattedWrite;
        formattedWrite(sink, "[%( 0x%x, %)]", offsets[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", sz[]);
        formattedWrite(sink, ", [%( 0x%x, %)]", storage);
    }

private:
    import std.meta : staticMap;
    // Pointer to the first word of level n.
    @property auto raw_ptr(size_t n)()inout pure nothrow @nogc
    {
        static if (n == 0)
            return storage.ptr;
        else
        {
            return storage.ptr+offsets[n];
        }
    }
    enum dim = Types.length;
    size_t[dim] offsets;// offset for level x
    size_t[dim] sz;// size of level x
    alias bitWidth = staticMap!(bitSizeOf, Types);
    size_t[] storage;
}
989 
// Resizes the levels of a MultiArray in several orders and checks after each
// step that the contents of the other levels are preserved; the scenario is
// executed both at compile time and at run time.
@system unittest
{
    import std.conv : text;
    enum dg = (){
        // sizes are:
        // lvl0: 3, lvl1 : 2, lvl2: 1
        auto m = MultiArray!(int, ubyte, int)(3,2,1);

        // expects level k to hold 1, 2, ..., n
        static void check(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == i+1, text("level:",i," : ",m.slice!(k)[0 .. n]));
        }

        // expects level k to hold n, n-1, ..., 1
        static void checkB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                assert(m.slice!(k)[i] == n-i, text("level:",i," : ",m.slice!(k)[0 .. n]));
        }

        // writes 1, 2, ..., n into level k
        static void fill(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(i+1);
        }

        // writes n, n-1, ..., 1 into level k
        static void fillB(size_t k, T)(ref T m, int n)
        {
            foreach (i; 0 .. n)
                m.slice!(k)[i] = force!ubyte(n-i);
        }

        m.length!1 = 100;
        fill!1(m, 100);
        check!1(m, 100);

        // growing level 0 must leave level 1 intact
        m.length!0 = 220;
        fill!0(m, 220);
        check!1(m, 100);
        check!0(m, 220);

        m.length!2 = 17;
        fillB!2(m, 17);
        checkB!2(m, 17);
        check!0(m, 220);
        check!1(m, 100);

        // growing the last level keeps its existing prefix
        m.length!2 = 33;
        checkB!2(m, 17);
        fillB!2(m, 33);
        checkB!2(m, 33);
        check!0(m, 220);
        check!1(m, 100);

        // growing the middle level must leave both neighbours intact
        m.length!1 = 195;
        fillB!1(m, 195);
        checkB!1(m, 195);
        checkB!2(m, 33);
        check!0(m, 220);

        // bit-packed levels: shrink the first, grow the second
        auto marr = MultiArray!(BitPacked!(uint, 4), BitPacked!(uint, 6))(20, 10);
        marr.length!0 = 15;
        marr.length!1 = 30;
        fill!1(marr, 30);
        fill!0(marr, 15);
        check!1(marr, 30);
        check!0(marr, 15);
        return 0;
    };
    enum ct = dg(); // CTFE run
    auto rt = dg(); // run-time run
}
1062 
@system unittest
{// more bitpacking tests
    import std.conv : text;

    // five levels of 3-, 4-, 3- and 6-bit packed values plus a bool level
    alias Bitty =
      MultiArray!(BitPacked!(size_t, 3)
                , BitPacked!(size_t, 4)
                , BitPacked!(size_t, 3)
                , BitPacked!(size_t, 6)
                , bool);
    // bit-field extractors splitting a 16-bit index into 3 + 4 + 3 + 6 bits
    alias fn1 = sliceBits!(13, 16);
    alias fn2 = sliceBits!( 9, 13);
    alias fn3 = sliceBits!( 6,  9);
    alias fn4 = sliceBits!( 0,  6);
    // expects every element of level lvl to equal its own index
    static void check(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            assert(arr.slice!(lvl)[i] == i, text("Mismatch on lvl ", lvl, " idx ", i, " value: ", arr.slice!(lvl)[i]));
    }

    // stores each index into its own slot of level lvl
    static void fillIdx(size_t lvl, MA)(ref MA arr){
        for (size_t i = 0; i< arr.length!lvl; i++)
            arr.slice!(lvl)[i] = i;
    }
    Bitty m1;

    // size the levels last-to-first, starting from an empty array
    m1.length!4 = 10;
    m1.length!3 = 2^^6;
    m1.length!2 = 2^^3;
    m1.length!1 = 2^^4;
    m1.length!0 = 2^^3;

    m1.length!4 = 2^^16;

    for (size_t i = 0; i< m1.length!4; i++)
        m1.slice!(4)[i] = i % 2;

    // writing one level must not disturb the others
    fillIdx!1(m1);
    check!1(m1);
    fillIdx!2(m1);
    check!2(m1);
    fillIdx!3(m1);
    check!3(m1);
    fillIdx!0(m1);
    check!0(m1);
    check!3(m1);
    check!2(m1);
    check!1(m1);
    // interleaved writes to all levels, then read everything back
    for (size_t i=0; i < 2^^16; i++)
    {
        m1.slice!(4)[i] = i % 2;
        m1.slice!(0)[fn1(i)] = fn1(i);
        m1.slice!(1)[fn2(i)] = fn2(i);
        m1.slice!(2)[fn3(i)] = fn3(i);
        m1.slice!(3)[fn4(i)] = fn4(i);
    }
    for (size_t i=0; i < 2^^16; i++)
    {
        assert(m1.slice!(4)[i] == i % 2);
        assert(m1.slice!(0)[fn1(i)] == fn1(i));
        assert(m1.slice!(1)[fn2(i)] == fn2(i));
        assert(m1.slice!(2)[fn3(i)] == fn3(i));
        assert(m1.slice!(3)[fn4(i)] == fn4(i));
    }
}
1127 
// Number of size_t words needed to store `new_len` elements of `_bits` bits
// each, where every element occupies a power-of-two sized slot.
size_t spaceFor(size_t _bits)(size_t new_len) @safe pure nothrow @nogc
{
    import std.math.algebraic : nextPow2;
    enum size_t wordBits = 8*size_t.sizeof;
    // slot width in bits, see PackedArrayView
    static if (_bits == 1)
        enum size_t slotBits = 1;
    else
        enum size_t slotBits = nextPow2(_bits - 1);

    static if (slotBits > wordBits)
    {
        // an element spans a whole number of words
        static assert(slotBits % wordBits == 0);
        return new_len * slotBits/wordBits;
    }
    else
    {
        // several elements share one word
        enum perWord = wordBits/slotBits;
        return (new_len + perWord - 1)/perWord; // rounded up
    }
}
1143 
// True for types that can be stored bit-packed: built-in integers,
// character types, bool and BitPacked wrappers.
enum isBitPackableType(T) =
    isIntegral!T || isSomeChar!T || is(T == bool) || isBitPacked!T;
1149 
1150 //============================================================================
// Picks the PackedArrayViewImpl instantiation for element type T. The slot
// width passed on is nextPow2(bitSizeOf!T - 1), or 1 when T is a single bit.
template PackedArrayView(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math.algebraic : nextPow2;
    private enum bits = bitSizeOf!T;
    alias PackedArrayView = PackedArrayViewImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1);
}
1159 
//unsafe and fast access to a chunk of RAM as if it contains packed values
// Picks the PackedPtrImpl instantiation for element type T, using the same
// slot width rule as PackedArrayView: nextPow2(bitSizeOf!T - 1), or 1 when
// T is a single bit.
template PackedPtr(T)
if ((is(T dummy == BitPacked!(U, sz), U, size_t sz)
    && isBitPackableType!U) || isBitPackableType!T)
{
    import std.math.algebraic : nextPow2;
    private enum bits = bitSizeOf!T;
    alias PackedPtr = PackedPtrImpl!(T, bits > 1 ? nextPow2(bits - 1) : 1);
}
1169 
// Unchecked indexed access to values of type T stored in `bits`-wide slots
// inside an array of size_t words. No bounds information is kept.
struct PackedPtrImpl(T, size_t bits)
{
pure nothrow:
    static assert(isPow2OrZero(bits));

    this(inout(size_t)* ptr)inout @safe @nogc
    {
        origin = ptr;
    }

    // Generic read: select word n / factor, shift slot n % factor down to
    // bit 0 and mask off everything above the slot.
    private T simpleIndex(size_t n) inout
    {
        immutable q = n / factor;
        immutable r = n % factor;
        return cast(T)((origin[q] >> bits*r) & mask);
    }

    // Generic write: clear the target slot in its word, then OR in the new
    // value shifted into place.
    private void simpleWrite(TypeOfBitPacked!T val, size_t n)
    in
    {
        static if (isIntegral!T)
            assert(val <= mask);
    }
    do
    {
        immutable q = n / factor;
        immutable r = n % factor;
        immutable tgt_shift = bits*r;
        immutable word = origin[q];
        origin[q] = (word & ~(mask << tgt_shift))
            | (cast(size_t) val << tgt_shift);
    }

    // Slots that coincide with a built-in unsigned type are accessed by
    // reinterpreting the word buffer as an array of that type.
    static if (factor == bytesPerWord// can safely pack by byte
         || factor == 1 // a whole word at a time
         || ((factor == bytesPerWord/2 || factor == bytesPerWord/4)
                && hasUnalignedReads)) // this needs unaligned reads
    {
        // U - built-in type exactly as wide as one slot
        static if (factor == bytesPerWord)
            alias U = ubyte;
        else static if (factor == bytesPerWord/2)
            alias U = ushort;
        else static if (factor == bytesPerWord/4)
            alias U = uint;
        else static if (size_t.sizeof == 8 && factor == bytesPerWord/8)
            alias U = ulong;

        T opIndex(size_t idx) inout
        {
            T ret;
            // the direct read is used only on little-endian targets at run
            // time; CTFE and other targets take the shift-and-mask path
            version (LittleEndian)
                ret = __ctfe ? simpleIndex(idx) :
                    cast(inout(T))(cast(U*) origin)[idx];
            else
                ret = simpleIndex(idx);
            return ret;
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
        {
            // same path selection as in opIndex
            version (LittleEndian)
            {
                if (__ctfe)
                    simpleWrite(val, idx);
                else
                    (cast(U*) origin)[idx] = cast(U) val;
            }
            else
                simpleWrite(val, idx);
        }
    }
    else
    {
        // all other slot widths always use the shift-and-mask path
        T opIndex(size_t n) inout
        {
            return simpleIndex(n);
        }

        static if (isBitPacked!T) // lack of user-defined implicit conversion
        {
            void opIndexAssign(T val, size_t idx)
            {
                return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
            }
        }

        void opIndexAssign(TypeOfBitPacked!T val, size_t n)
        {
            return simpleWrite(val, n);
        }
    }

private:
    // factor - number of elements in one machine word
    // mask - the low `bits` bits set
    enum factor = size_t.sizeof*8/bits, mask = 2^^bits-1;
    enum bytesPerWord =  size_t.sizeof;
    size_t* origin; // first word of the packed data
}
1276 
// data is packed only by power of two sized packs per word,
// thus avoiding mul/div overhead at the cost of ultimate packing
// this construct doesn't own memory, only provides access, see MultiArray for usage
//
// A view covers `limit` elements that start `ofs` elements after `origin`.
struct PackedArrayViewImpl(T, size_t bits)
{
pure nothrow:

    this(inout(size_t)* origin, size_t offset, size_t items) inout @safe
    {
        ptr = inout(PackedPtr!(T))(origin);
        ofs = offset;
        limit = items;
    }

    // Returns true when every element in [s, e) of this view is zero.
    bool zeros(size_t s, size_t e)
    in
    {
        assert(s <= e);
    }
    do
    {
        s += ofs;
        e += ofs;
        immutable pad_s = roundUp(s);
        // The range does not reach past the first word boundary: test it
        // element by element. (The comparison has to use the rounded-up
        // start; otherwise the head loop below would run up to pad_s and
        // inspect elements beyond e.)
        if (pad_s >= e)
        {
            foreach (i; s .. e)
                if (ptr[i])
                    return false;
            return true;
        }
        immutable pad_e = roundDown(e);
        size_t i;
        // unaligned head
        for (i=s; i<pad_s; i++)
            if (ptr[i])
                return false;
        // all in between is x*factor elements
        for (size_t j=i/factor; i<pad_e; i+=factor, j++)
            if (ptr.origin[j])
                return false;
        // unaligned tail
        for (; i<e; i++)
            if (ptr[i])
                return false;
        return true;
    }

    T opIndex(size_t idx) inout
    in
    {
        assert(idx < limit);
    }
    do
    {
        return ptr[ofs + idx];
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversion
    {
        void opIndexAssign(T val, size_t idx)
        {
            return opIndexAssign(cast(TypeOfBitPacked!T) val, idx);
        }
    }

    void opIndexAssign(TypeOfBitPacked!T val, size_t idx)
    in
    {
        assert(idx < limit);
    }
    do
    {
        ptr[ofs + idx] = val;
    }

    static if (isBitPacked!T) // lack of user-defined implicit conversions
    {
        void opSliceAssign(T val, size_t start, size_t end)
        {
            opSliceAssign(cast(TypeOfBitPacked!T) val, start, end);
        }
    }

    // Sets every element in [start, end) to val; whole words in the middle
    // of the range are written with one store each.
    void opSliceAssign(TypeOfBitPacked!T val, size_t start, size_t end)
    in
    {
        assert(start <= end);
        assert(end <= limit);
    }
    do
    {
        // account for the view's offset
        start += ofs;
        end += ofs;
        // rounded to factor granularity
        immutable pad_start = roundUp(start);// rounded up
        if (pad_start >= end) //rounded up >= than the end of slice
        {
            //nothing to gain, use per element assignment
            foreach (i; start .. end)
                ptr[i] = val;
            return;
        }
        immutable pad_end = roundDown(end); // rounded down
        size_t i;
        for (i=start; i<pad_start; i++)
            ptr[i] = val;
        // all in between is x*factor elements
        if (pad_start != pad_end)
        {
            immutable repval = replicateBits!(factor, bits)(val);
            for (size_t j=i/factor; i<pad_end; i+=factor, j++)
                ptr.origin[j] = repval;// so speed it up by factor
        }
        for (; i<end; i++)
            ptr[i] = val;
    }

    // Sub-view over elements [from, to) of this view.
    auto opSlice(size_t from, size_t to)inout
    in
    {
        assert(from <= to);
        // `limit` is an element count relative to the start of the view
        // (see opIndex), so the bound must not include `ofs`
        assert(to <= limit);
    }
    do
    {
        return typeof(this)(ptr.origin, ofs + from, to - from);
    }

    auto opSlice(){ return opSlice(0, length); }

    // Element-wise equality; compares whole words when both views are
    // word-aligned and cover a whole number of words.
    bool opEquals(T)(auto ref T arr) const
    {
        if (limit != arr.limit)
           return false;
        size_t s1 = ofs, s2 = arr.ofs;
        size_t e1 = s1 + limit, e2 = s2 + limit;
        if (s1 % factor == 0 && s2 % factor == 0 && length % factor == 0)
        {
            return ptr.origin[s1/factor .. e1/factor]
                == arr.ptr.origin[s2/factor .. e2/factor];
        }
        for (size_t i=0;i<limit; i++)
            if (this[i] != arr[i])
                return false;
        return true;
    }

    @property size_t length()const{ return limit; }

private:
    // round an element index up / down to a multiple of factor
    auto roundUp()(size_t val){ return (val+factor-1)/factor*factor; }
    auto roundDown()(size_t val){ return val/factor*factor; }
    // factor - number of elements in one machine word
    enum factor = size_t.sizeof*8/bits;
    PackedPtr!(T) ptr;
    size_t ofs, limit;
}
1434 
1435 
// Random-access range over the index window [from, to) of an indexable
// object that is referenced through a pointer. Slicing and popping only
// move the window; element access is forwarded to the referenced object.
private struct SliceOverIndexed(T)
{
    enum assignableIndex = is(typeof((){ T.init[0] = Item.init; }));
    enum assignableSlice = is(typeof((){ T.init[0 .. 0] = Item.init; }));

    // --- window state queries ---

    @property size_t length()const { return to-from;}

    alias opDollar = length;

    @property bool empty()const { return from == to; }

    @property auto save() inout { return this; }

    // --- element access ---

    auto opIndex(size_t idx)const
    in
    {
        assert(idx < length);
    }
    do
    {
        return (*arr)[from+idx];
    }

    static if (assignableIndex)
    void opIndexAssign(Item val, size_t idx)
    in
    {
        assert(idx < length);
    }
    do
    {
       (*arr)[from+idx] = val;
    }

    @property auto front()const { return (*arr)[from]; }

    static if (assignableIndex)
    @property void front(Item val) { (*arr)[from] = val; }

    @property auto back()const { return (*arr)[to-1]; }

    static if (assignableIndex)
    @property void back(Item val) { (*arr)[to-1] = val; }

    // --- window manipulation ---

    void popFront() {   from++; }

    void popBack() {    to--; }

    auto opSlice()
    {
        return this;
    }

    auto opSlice(size_t a, size_t b)
    {
        return typeof(this)(from+a, from+b, arr);
    }

    // static if (assignableSlice)
    void opSliceAssign(V)(V val, size_t start, size_t end)
    {
        (*arr)[start+from .. end+from] = val;
    }

    // Equal when the lengths match and all elements compare equal pairwise.
    bool opEquals(Rhs)(auto ref Rhs arr) const
    {
        if (arr.length != length)
            return false;
        foreach (i; 0 .. length)
        {
            if (this[i] != arr[i])
                return false;
        }
        return true;
    }
private:
    alias Item = typeof(T.init[0]);
    size_t from, to;
    T* arr;
}
1513 
// Compile-time check: a SliceOverIndexed over a plain array is a random-access range.
@safe pure nothrow @nogc unittest
{
    static assert(isRandomAccessRange!(SliceOverIndexed!(int[])));
}
1518 
// Builds a read-only window [a, b) over the container pointed to by `x`.
SliceOverIndexed!(const(T)) sliceOverIndexed(T)(size_t a, size_t b, const(T)* x)
if (is(Unqual!T == T))
{
    alias Window = typeof(return);
    return Window(a, b, x);
}
1524 
// BUG? inout is out of reach
//...SliceOverIndexed.arr only parameters or stack based variables can be inout
// Mutable counterpart: builds a window [a, b) over the container pointed to by `x`.
SliceOverIndexed!T sliceOverIndexed(T)(size_t a, size_t b, T* x)
if (is(Unqual!T == T))
{
    alias Window = typeof(return);
    return Window(a, b, x);
}
1532 
// Exercises the range primitives, slicing, element/slice assignment and
// comparison of SliceOverIndexed through sliceOverIndexed.
@system unittest
{
    int[] idxArray = [2, 3, 5, 8, 13];
    auto sliced = sliceOverIndexed(0, idxArray.length, &idxArray);

    assert(!sliced.empty);
    assert(sliced.front == 2);
    sliced.front = 1;
    assert(sliced.front == 1);
    assert(sliced.back == 13);
    sliced.popFront();
    assert(sliced.front == 3);
    assert(sliced.back == 13);
    sliced.back = 11;
    assert(sliced.back == 11);
    sliced.popBack();

    assert(sliced.front == 3);
    assert(sliced[$-1] == 8);
    sliced = sliced[];
    assert(sliced[0] == 3);
    assert(sliced.back == 8);
    sliced = sliced[1..$];
    assert(sliced.front == 5);
    sliced = sliced[0..$-1];
    assert(sliced[$-1] == 5);

    int[] other = [2, 5];
    assert(sliced[] == sliceOverIndexed(1, 2, &other));
    sliceOverIndexed(0, 2, &idxArray)[0 .. 2] = -1;
    assert(idxArray[0 .. 2] == [-1, -1]);
    // an empty window over a null array: previously `nullArr` was declared
    // but the slice was taken over `idxArray`, leaving this case untested
    uint[] nullArr = null;
    auto nullSlice = sliceOverIndexed(0, 0, &nullArr);
    assert(nullSlice.empty);
    assert(nullSlice.length == 0);
}
1568 
// Wraps `items` packed elements stored at `ptr` into a view starting at offset 0.
private inout(PackedArrayView!T) packedArrayView(T)(inout(size_t)* ptr, size_t items)
{
    alias View = typeof(return);
    return View(ptr, 0, items);
}
1573 
1574 
1575 //============================================================================
1576 // Partially unrolled binary search using Shar's method
1577 //============================================================================
1578 
/*
    Generates, as source text for `mixin`, the tail of a binary search unrolled
    into a fall-through `switch`.

    The generated code expects these names at the mixin site: `m` (current
    probe distance, a power of two below `size`, or zero), `idx` (running
    result), `range`, `needle` and `pred`. It enters the switch at the case
    matching `m` and falls through every smaller power of two down to a final
    probe of `range[idx]`.

    Params:
        size = power of two bounding the probe distances to unroll
    Returns: the generated D code
*/
string genUnrolledSwitchSearch(size_t size) @safe pure nothrow
{
    import core.bitop : bsr;
    import std.array : replace;
    import std.conv : to;
    assert(isPow2OrZero(size));
    // bsr(0) is undefined; a zero size simply produces no halving cases
    immutable int levels = size ? bsr(size) : 0;
    // Likewise guard the generated code: for a one-element range `m` is 0 and
    // bsr(m) would be undefined, so jump straight to the final probe (case 0).
    string code = `
    import core.bitop : bsr;
    auto power = m ? bsr(m)+1 : 0;
    switch (power){`;
    size_t i = levels;
    foreach_reverse (val; 0 .. levels)
    {
        auto v = 2^^val;
        // "m" and "pow" below are placeholders substituted per case
        code ~= `
        case pow:
            if (pred(range[idx+m], needle))
                idx +=  m;
            goto case;
        `.replace("m", to!string(v))
        .replace("pow", to!string(i));
        i--;
    }
    code ~= `
        case 0:
            if (pred(range[idx], needle))
                idx += 1;
            goto default;
        `;
    code ~= `
        default:
    }`;
    return code;
}
1613 
// True when `sz` is zero or has exactly one bit set.
bool isPow2OrZero(size_t sz) @safe pure nothrow @nogc
{
    // See also: std.math.isPowerOf2()
    if (sz == 0)
        return true;
    // clearing the lowest set bit leaves nothing only for powers of two
    immutable size_t withoutLowestBit = sz & (sz - 1);
    return withoutLowestBit == 0;
}
1619 
/*
    Binary search with a fixed probe sequence over a range whose length is a
    power of two.

    Returns: the number of leading elements `e` for which `pred(e, needle)` holds,
    provided `pred` holds for a prefix of `range` only.
*/
size_t uniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    size_t idx = 0;
    // halve the probe distance each round, moving right whenever pred holds
    for (size_t step = range.length/2; step != 0; step /= 2)
    {
        if (pred(range[idx+step], needle))
            idx += step;
    }
    // final probe decides between idx and idx + 1
    return pred(range[idx], needle) ? idx + 1 : idx;
}
1635 
/*
    Lower-bound search over a range whose length is a power of two (asserted),
    finishing with the `switch` generated by `genUnrolledSwitchSearch`.

    The loop halves the probe distance `m` while it is at least `max`; the
    mixed-in code then continues from the remaining `m`. The generated code
    refers to the locals `range`, `needle`, `idx`, `m` and to `pred` by name,
    so these must not be renamed.
*/
size_t switchUniformLowerBound(alias pred, Range, T)(Range range, T needle)
if (is(T : ElementType!Range))
{
    assert(isPow2OrZero(range.length));
    size_t idx = 0, m = range.length/2;
    // probe distances below `max` are handled by the unrolled switch
    enum max = 1 << 10;
    while (m >= max)
    {
        if (pred(range[idx+m], needle))
            idx += m;
        m /= 2;
    }
    mixin(genUnrolledSwitchSearch(max));
    return idx;
}
1651 
/*
    Lifts `uniLowerBound`, a lower-bound search that only handles ranges of
    power-of-two length, to ranges of any length.

    An empty range yields 0. A power-of-two length is forwarded directly.
    Otherwise the range is covered by two overlapping power-of-two windows:
    the first `n` elements (`n` being the largest power of two not exceeding
    the length) and a window ending at the end of the range; one probe of
    `range[n-1]` decides which window to search.
*/
template sharMethod(alias uniLowerBound)
{
    size_t sharMethod(alias _pred="a<b", Range, T)(Range range, T needle)
        if (is(T : ElementType!Range))
    {
        import std.functional : binaryFun;
        import std.math.algebraic : nextPow2, truncPow2;
        alias pred = binaryFun!_pred;
        if (range.length == 0)
            return 0;
        if (isPow2OrZero(range.length))
            return uniLowerBound!pred(range, needle);
        size_t n = truncPow2(range.length);
        if (pred(range[n-1], needle))
        {// search in another 2^^k area that fully covers the tail of range
            // The tail holds range.length - n elements: at least 1 and fewer
            // than n. nextPow2 is strictly greater than its argument, so k is
            // a power of two with tail < k <= n <= range.length. (Using
            // tail + 1 here made k == 2*n for lengths 3, 7, 15, ... and the
            // slice below ran past the front of the range.)
            size_t k = nextPow2(range.length - n);
            return range.length - k + uniLowerBound!pred(range[$-k..$], needle);
        }
        else
            return uniLowerBound!pred(range[0 .. n], needle);
    }
}

// Shar's method on top of the loop-based and the switch-based uniform searches
alias sharLowerBound = sharMethod!uniformLowerBound;
alias sharSwitchLowerBound = sharMethod!switchUniformLowerBound;

@safe unittest
{
    import std.array : array;
    import std.range : assumeSorted, iota;

    // reference implementation to compare against
    auto stdLowerBound(T)(T[] range, T needle)
    {
        return assumeSorted(range).lowerBound(needle).length;
    }
    immutable MAX = 5*1173;
    auto arr = array(iota(5, MAX, 5));
    assert(arr.length == MAX/5-1);
    foreach (i; 0 .. MAX+5)
    {
        auto st = stdLowerBound(arr, i);
        assert(st == sharLowerBound(arr, i));
        assert(st == sharSwitchLowerBound(arr, i));
    }
    arr = [];
    auto st = stdLowerBound(arr, 33);
    assert(st == sharLowerBound(arr, 33));
    assert(st == sharSwitchLowerBound(arr, 33));

    // every small length, including 3, 7, 15 and 31 (one below a power of two)
    foreach (len; 1 .. 41)
    {
        auto small = array(iota(0, 2*len, 2));
        assert(small.length == len);
        foreach (needle; -1 .. 2*len + 2)
        {
            auto expected = stdLowerBound(small, needle);
            assert(expected == sharLowerBound(small, needle));
            // one-element ranges are left to the loop-based search only
            if (len > 1)
                assert(expected == sharSwitchLowerBound(small, needle));
        }
    }
}
1701 //============================================================================
1702 
1703 @safe
1704 {
// hope to see similar stuff in public interface... once Allocators are out
1706 //@@@BUG moveFront and friends? dunno, for now it's POD-only
1707 
/*
    Replaces `dest[from .. to]` with the contents of `stuff`, growing or
    shrinking `dest` as needed.

    With `Policy == void` the length of `dest` is changed through its `length`
    property; otherwise `dest` is reassigned from `Policy.realloc`.

    Params:
        dest = storage to modify in place
        from = start of the replaced span
        to = end (exclusive) of the replaced span
        stuff = replacement elements; must provide `length`
    Returns: `from + stuff.length`, the index just past the inserted elements
*/
@trusted size_t genericReplace(Policy=void, T, Range)
    (ref T dest, size_t from, size_t to, Range stuff)
{
    import std.algorithm.mutation : copy;
    size_t delta = to - from;
    size_t stuff_end = from+stuff.length;
    if (stuff.length > delta)
    {// replace increases length
        delta = stuff.length - delta;// now, new is > old  by delta
        // grow first, then shift the old tail right to make room
        static if (is(Policy == void))
            dest.length = dest.length+delta;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length+delta);
        copyBackwards(dest[to .. dest.length-delta],
            dest[to+delta .. dest.length]);
        copyForward(stuff, dest[from .. stuff_end]);
    }
    else if (stuff.length == delta)
    {
        // same size: overwrite in place, no resizing
        copy(stuff, dest[from .. to]);
    }
    else
    {// replace decreases length by delta
        delta = delta - stuff.length;
        // write the replacement, shift the old tail left, then shrink
        copy(stuff, dest[from .. stuff_end]);
        copyForward(dest[to .. dest.length],
            dest[stuff_end .. dest.length-delta]);
        static if (is(Policy == void))
            dest.length = dest.length - delta;//@@@BUG lame @property
        else
            dest = Policy.realloc(dest, dest.length-delta);
    }
    return stuff_end;
}
1742 
1743 
// Simple storage manipulation policy: every operation maps onto the
// built-in dynamic array facilities
@safe private struct GcPolicy
{
    import std.traits : isDynamicArray;

    // Returns a mutable copy of `arr`.
    static T[] dup(T)(const T[] arr)
    {
        return arr.dup;
    }

    // Returns a new array of `size` elements.
    static T[] alloc(T)(size_t size)
    {
        auto fresh = new T[size];
        return fresh;
    }

    // Returns `arr` resized to `sz` elements.
    static T[] realloc(T)(T[] arr, size_t sz)
    {
        arr.length = sz;
        return arr;
    }

    // Replaces `dest[from .. to]` with `stuff`.
    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        replaceInPlace(dest, from, to, stuff);
    }

    // Appends a single value.
    static void append(T, V)(ref T[] arr, V value)
        if (!isInputRange!V)
    {
        arr ~= force!T(value);
    }

    // Appends every element of an input range.
    static void append(T, V)(ref T[] arr, V value)
        if (isInputRange!V)
    {
        insertInPlace(arr, arr.length, value);
    }

    // Drops the reference held in `arr`. In debug builds an array whose type
    // carries no qualifier is first overwritten with a marker value.
    static void destroy(T)(ref T arr) pure // pure required for -dip25, inferred for -dip1000
        if (isDynamicArray!T)
    {
        static if (is(Unqual!T == T))
        {
            debug
            {
                arr[] = cast(typeof(T.init[0]))(0xdead_beef);
            }
        }
        arr = null;
    }
}
1798 
// Storage manipulation policy built on malloc/realloc/free; arrays handed
// out by it must be released with `destroy`.
@safe struct ReallocPolicy
{
    import std.range.primitives : hasLength;

    // Returns a freshly allocated copy of `arr`.
    static T[] dup(T)(const T[] arr)
    {
        auto result = alloc!T(arr.length);
        result[] = arr[];
        return result;
    }

    // Allocates an uninitialized array of `size` elements.
    // Returns null for `size == 0`; halts on size overflow.
    static T[] alloc(T)(size_t size) @trusted
    {
        import std.internal.memory : enforceMalloc;

        // A zero-byte malloc may legitimately return null, which
        // enforceMalloc would report as an allocation failure. An empty
        // array needs no storage, and `destroy`/`realloc` accept null.
        if (!size)
            return null;

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        if (overflow) assert(0);

        auto ptr = cast(T*) enforceMalloc(nbytes);
        return ptr[0 .. size];
    }

    // Resizes `arr` to `size` elements; a zero size frees it and returns null.
    static T[] realloc(T)(return scope T[] arr, size_t size) @trusted
    {
        import std.internal.memory : enforceRealloc;
        if (!size)
        {
            destroy(arr);
            return null;
        }

        import core.checkedint : mulu;
        bool overflow;
        size_t nbytes = mulu(size, T.sizeof, overflow);
        if (overflow) assert(0);

        auto ptr = cast(T*) enforceRealloc(arr.ptr, nbytes);
        return ptr[0 .. size];
    }

    // Replaces `dest[from .. to]` with `stuff`, resizing through this policy.
    static void replaceImpl(T, Range)(ref T[] dest, size_t from, size_t to, Range stuff)
    {
        genericReplace!(ReallocPolicy)(dest, from, to, stuff);
    }

    // Appends a single value.
    static void append(T, V)(ref T[] arr, V value)
        if (!isInputRange!V)
    {
        // length + 1 must not wrap around
        if (arr.length == size_t.max) assert(0);
        arr = realloc(arr, arr.length+1);
        arr[$-1] = force!T(value);
    }

    pure @safe unittest
    {
        int[] arr;
        ReallocPolicy.append(arr, 3);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [3]));
    }

    // Appends every element of a range with known length.
    static void append(T, V)(ref T[] arr, V value)
        if (isInputRange!V && hasLength!V)
    {
        import core.checkedint : addu;
        bool overflow;
        size_t nelems = addu(arr.length, value.length, overflow);
        if (overflow) assert(0);

        arr = realloc(arr, nelems);

        import std.algorithm.mutation : copy;
        copy(value, arr[$-value.length..$]);
    }

    pure @safe unittest
    {
        int[] arr;
        ReallocPolicy.append(arr, [1,2,3]);

        import std.algorithm.comparison : equal;
        assert(equal(arr, [1,2,3]));
    }

    // Frees the storage of `arr` (if any) and resets it to null.
    static void destroy(T)(scope ref T[] arr) @trusted
    {
        import core.memory : pureFree;
        if (arr.ptr)
            pureFree(arr.ptr);
        arr = null;
    }
}
1895 
//build hack
// NOTE(review): presumably here only to force an instantiation of
// CowArray with ReallocPolicy - confirm before removing
alias _RealArray = CowArray!ReallocPolicy;
1898 
// replaceImpl of ReallocPolicy: insertion, removal, shrinking and growing replacements
pure @safe unittest
{
    import std.algorithm.comparison : equal;

    with(ReallocPolicy)
    {
        // applies the replacement to `orig`, compares against `result`
        // and releases the storage in any case
        bool test(T, U, V)(T orig, size_t from, size_t to, U toReplace, V result,
                   string file = __FILE__, size_t line = __LINE__)
        {
            replaceImpl(orig, from, to, toReplace);
            scope(exit) destroy(orig);
            return equal(orig, result);
        }
        // malloc-backed copy of the arguments
        static T[] arr(T)(T[] args... )
        {
            return dup(args);
        }

        // pure insertion at the front
        assert(test(arr([1, 2, 3, 4]), 0, 0, [5, 6, 7], [5, 6, 7, 1, 2, 3, 4]));
        // pure removal
        assert(test(arr([1, 2, 3, 4]), 0, 2, cast(int[])[], [3, 4]));
        // replacement shorter than the replaced span
        assert(test(arr([1, 2, 3, 4]), 0, 4, [5, 6, 7], [5, 6, 7]));
        // replacement longer than the replaced span
        assert(test(arr([1, 2, 3, 4]), 0, 2, [5, 6, 7], [5, 6, 7, 3, 4]));
        assert(test(arr([1, 2, 3, 4]), 2, 3, [5, 6, 7], [1, 2, 5, 6, 7, 4]));
    }
}
1928 
1929 /**
1930     Tests if T is some kind a set of code points. Intended for template constraints.
1931 */
1932 public template isCodepointSet(T)
1933 {
1934     static if (is(T dummy == InversionList!(Args), Args...))
1935         enum isCodepointSet = true;
1936     else
1937         enum isCodepointSet = false;
1938 }
1939 
1940 /**
1941     Tests if `T` is a pair of integers that implicitly convert to `V`.
1942     The following code must compile for any pair `T`:
1943     ---
1944     (T x){ V a = x[0]; V b = x[1];}
1945     ---
1946     The following must not compile:
1947      ---
1948     (T x){ V c = x[2];}
1949     ---
1950 */
1951 public template isIntegralPair(T, V=uint)
1952 {
1953     enum isIntegralPair = is(typeof((T x){ V a = x[0]; V b = x[1];}))
1954         && !is(typeof((T x){ V c = x[2]; }));
1955 }
1956 
1957 
1958 /**
1959     The recommended default type for set of $(CODEPOINTS).
1960     For details, see the current implementation: $(LREF InversionList).
1961 */
1962 public alias CodepointSet = InversionList!GcPolicy;
1963 
1964 
1965 //@@@BUG: std.typecons tuples depend on std.format to produce fields mixin
1966 // which relies on std.uni.isGraphical and this chain blows up with Forward reference error
1967 // hence below doesn't seem to work
1968 // public alias CodepointInterval = Tuple!(uint, "a", uint, "b");
1969 
1970 /**
1971     The recommended type of $(REF Tuple, std,_typecons)
1972     to represent [a, b$(RPAREN) intervals of $(CODEPOINTS). As used in $(LREF InversionList).
1973     Any interval type should pass $(LREF isIntegralPair) trait.
1974 */
1975 public struct CodepointInterval
1976 {
1977 pure:
1978     uint[2] _tuple;
1979     alias _tuple this;
1980 
1981 @safe pure nothrow @nogc:
1982 
1983     this(uint low, uint high)
1984     {
1985         _tuple[0] = low;
1986         _tuple[1] = high;
1987     }
1988     bool opEquals(T)(T val) const
1989     {
1990         return this[0] == val[0] && this[1] == val[1];
1991     }
1992     @property ref inout(uint) a() return inout { return _tuple[0]; }
1993     @property ref inout(uint) b() return inout { return _tuple[1]; }
1994 }
1995 
1996 /**
1997     $(P
1998     `InversionList` is a set of $(CODEPOINTS)
1999     represented as an array of open-right [a, b$(RPAREN)
2000     intervals (see $(LREF CodepointInterval) above).
2001     The name comes from the way the representation reads left to right.
2002     For instance a set of all values [10, 50$(RPAREN), [80, 90$(RPAREN),
2003     plus a singular value 60 looks like this:
2004     )
2005     ---
2006     10, 50, 60, 61, 80, 90
2007     ---
2008     $(P
    The way to read this is: start with negative meaning that all numbers
    smaller than the next one are not present in this set (and positive -
2011     the contrary). Then switch positive/negative after each
2012     number passed from left to right.
2013     )
2014     $(P This way negative spans until 10, then positive until 50,
2015     then negative until 60, then positive until 61, and so on.
2016     As seen this provides a space-efficient storage of highly redundant data
2017     that comes in long runs. A description which Unicode $(CHARACTER)
2018     properties fit nicely. The technique itself could be seen as a variation
2019     on $(LINK2 https://en.wikipedia.org/wiki/Run-length_encoding, RLE encoding).
2020     )
2021 
2022     $(P Sets are value types (just like `int` is) thus they
2023         are never aliased.
2024     )
2025         Example:
2026         ---
2027         auto a = CodepointSet('a', 'z'+1);
2028         auto b = CodepointSet('A', 'Z'+1);
2029         auto c = a;
2030         a = a | b;
2031         assert(a == CodepointSet('A', 'Z'+1, 'a', 'z'+1));
2032         assert(a != c);
2033         ---
2034     $(P See also $(LREF unicode) for simpler construction of sets
2035         from predefined ones.
2036     )
2037 
2038     $(P Memory usage is 8 bytes per each contiguous interval in a set.
2039     The value semantics are achieved by using the
2040     $(HTTP en.wikipedia.org/wiki/Copy-on-write, COW) technique
2041     and thus it's $(RED not) safe to cast this type to $(D_KEYWORD shared).
2042     )
2043 
2044     Note:
2045     $(P It's not recommended to rely on the template parameters
2046     or the exact type of a current $(CODEPOINT) set in `std.uni`.
2047     The type and parameters may change when the standard
2048     allocators design is finalized.
2049     Use $(LREF isCodepointSet) with templates or just stick with the default
2050     alias $(LREF CodepointSet) throughout the whole code base.
2051     )
2052 */
2053 public struct InversionList(SP=GcPolicy)
2054 {
2055     import std.range : assumeSorted;
2056 
2057     /**
2058         Construct from another code point set of any type.
2059     */
2060     this(Set)(Set set) pure
2061         if (isCodepointSet!Set)
2062     {
2063         uint[] arr;
2064         foreach (v; set.byInterval)
2065         {
2066             arr ~= v.a;
2067             arr ~= v.b;
2068         }
2069         data = CowArray!(SP).reuse(arr);
2070     }
2071 
2072     /**
2073         Construct a set from a forward range of code point intervals.
2074     */
2075     this(Range)(Range intervals) pure
2076         if (isForwardRange!Range && isIntegralPair!(ElementType!Range))
2077     {
2078         uint[] arr;
2079         foreach (v; intervals)
2080         {
2081             SP.append(arr, v.a);
2082             SP.append(arr, v.b);
2083         }
2084         data = CowArray!(SP).reuse(arr);
2085         sanitize(); //enforce invariant: sort intervals etc.
2086     }
2087 
2088     //helper function that avoids sanity check to be CTFE-friendly
2089     private static fromIntervals(Range)(Range intervals) pure
2090     {
2091         import std.algorithm.iteration : map;
2092         import std.range : roundRobin;
2093         auto flattened = roundRobin(intervals.save.map!"a[0]"(),
2094             intervals.save.map!"a[1]"());
2095         InversionList set;
2096         set.data = CowArray!(SP)(flattened);
2097         return set;
2098     }
2099     //ditto untill sort is CTFE-able
2100     private static fromIntervals()(uint[] intervals...) pure
2101     in
2102     {
2103         import std.conv : text;
2104         assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
2105         for (uint i = 0; i < intervals.length; i += 2)
2106         {
2107             auto a = intervals[i], b = intervals[i+1];
2108             assert(a < b, text("illegal interval [a, b): ", a, " > ", b));
2109         }
2110     }
2111     do
2112     {
2113         InversionList set;
2114         set.data = CowArray!(SP)(intervals);
2115         return set;
2116     }
2117 
2118     /**
2119         Construct a set from plain values of code point intervals.
2120     */
2121     this()(uint[] intervals...)
2122     in
2123     {
2124         import std.conv : text;
2125         assert(intervals.length % 2 == 0, "Odd number of interval bounds [a, b)!");
2126         for (uint i = 0; i < intervals.length; i += 2)
2127         {
2128             auto a = intervals[i], b = intervals[i+1];
2129             assert(a < b, text("illegal interval [a, b): ", a, " > ", b));
2130         }
2131     }
2132     do
2133     {
2134         data = CowArray!(SP)(intervals);
2135         sanitize(); //enforce invariant: sort intervals etc.
2136     }
2137 
2138     ///
2139     pure @safe unittest
2140     {
2141         import std.algorithm.comparison : equal;
2142 
2143         auto set = CodepointSet('a', 'z'+1, 'а', 'я'+1);
2144         foreach (v; 'a'..'z'+1)
2145             assert(set[v]);
2146         // Cyrillic lowercase interval
2147         foreach (v; 'а'..'я'+1)
2148             assert(set[v]);
2149         //specific order is not required, intervals may interesect
2150         auto set2 = CodepointSet('а', 'я'+1, 'a', 'd', 'b', 'z'+1);
2151         //the same end result
2152         assert(set2.byInterval.equal(set.byInterval));
2153         // test constructor this(Range)(Range intervals)
2154         auto chessPiecesWhite = CodepointInterval(9812, 9818);
2155         auto chessPiecesBlack = CodepointInterval(9818, 9824);
2156         auto set3 = CodepointSet([chessPiecesWhite, chessPiecesBlack]);
2157         foreach (v; '♔'..'♟'+1)
2158             assert(set3[v]);
2159     }
2160 
2161     /**
2162         Get range that spans all of the $(CODEPOINT) intervals in this $(LREF InversionList).
2163     */
2164     @property auto byInterval() scope
2165     {
2166         // TODO: change this to data[] once the -dip1000 errors have been fixed
2167         // see e.g. https://github.com/dlang/phobos/pull/6638
2168         import std.array : array;
2169         return Intervals!(typeof(data.array))(data.array);
2170     }
2171 
    // byInterval yields the open-right intervals in order
    @safe unittest
    {
        import std.algorithm.comparison : equal;
        import std.typecons : tuple;

        auto set = CodepointSet('A', 'D'+1, 'a', 'd'+1);

        assert(set.byInterval.equal([tuple('A','E'), tuple('a','e')]));
    }
2181 
    // Returns the intervals of this set collected into an array of
    // CodepointInterval; usable on const sets.
    package(std) @property const(CodepointInterval)[] intervals() const
    {
        import std.array : array;
        return Intervals!(typeof(data[]))(data[]).array;
    }
2187 
2188     /**
2189         Tests the presence of code point `val` in this set.
2190     */
2191     bool opIndex(uint val) const
2192     {
2193         // the <= ensures that searching in  interval of [a, b) for 'a' you get .length == 1
2194         // return assumeSorted!((a,b) => a <= b)(data[]).lowerBound(val).length & 1;
2195         return sharSwitchLowerBound!"a <= b"(data[], val) & 1;
2196     }
2197 
2198     ///
2199     pure @safe unittest
2200     {
2201         auto gothic = unicode.Gothic;
2202         // Gothic letter ahsa
2203         assert(gothic['\U00010330']);
2204         // no ascii in Gothic obviously
2205         assert(!gothic['$']);
2206     }
2207 
2208 
2209     // Linear scan for `ch`. Useful only for small sets.
2210     // TODO:
2211     // used internally in std.regex
2212     // should be properly exposed in a public API ?
2213     package(std) auto scanFor()(dchar ch) const
2214     {
2215         immutable len = data.length;
2216         for (size_t i = 0; i < len; i++)
2217             if (ch < data[i])
2218                 return i & 1;
2219         return 0;
2220     }
2221 
2222     /// Number of $(CODEPOINTS) in this set
2223     @property size_t length()
2224     {
2225         size_t sum = 0;
2226         foreach (iv; byInterval)
2227         {
2228             sum += iv.b - iv.a;
2229         }
2230         return sum;
2231     }
2232 
2233 // bootstrap full set operations from 4 primitives (suitable as a template mixin):
2234 // addInterval, skipUpTo, dropUpTo & byInterval iteration
2235 //============================================================================
2236 public:
2237     /**
2238         $(P Sets support natural syntax for set algebra, namely: )
2239         $(BOOKTABLE ,
2240             $(TR $(TH Operator) $(TH Math notation) $(TH Description) )
2241             $(TR $(TD &) $(TD a ∩ b) $(TD intersection) )
2242             $(TR $(TD |) $(TD a ∪ b) $(TD union) )
2243             $(TR $(TD -) $(TD a ∖ b) $(TD subtraction) )
2244             $(TR $(TD ~) $(TD a ~ b) $(TD symmetric set difference i.e. (a ∪ b) \ (a ∩ b)) )
2245         )
2246     */
2247     This opBinary(string op, U)(U rhs)
2248         if (isCodepointSet!U || is(U:dchar))
2249     {
2250         static if (op == "&" || op == "|" || op == "~")
2251         {// symmetric ops thus can swap arguments to reuse r-value
2252             static if (is(U:dchar))
2253             {
2254                 auto tmp = this;
2255                 mixin("tmp "~op~"= rhs; ");
2256                 return tmp;
2257             }
2258             else
2259             {
2260                 static if (is(Unqual!U == U))
2261                 {
2262                     // try hard to reuse r-value
2263                     mixin("rhs "~op~"= this;");
2264                     return rhs;
2265                 }
2266                 else
2267                 {
2268                     auto tmp = this;
2269                     mixin("tmp "~op~"= rhs;");
2270                     return tmp;
2271                 }
2272             }
2273         }
2274         else static if (op == "-") // anti-symmetric
2275         {
2276             auto tmp = this;
2277             tmp -= rhs;
2278             return tmp;
2279         }
2280         else
2281             static assert(0, "no operator "~op~" defined for Set");
2282     }
2283 
2284     ///
2285     pure @safe unittest
2286     {
2287         import std.algorithm.comparison : equal;
2288         import std.range : iota;
2289 
2290         auto lower = unicode.LowerCase;
2291         auto upper = unicode.UpperCase;
2292         auto ascii = unicode.ASCII;
2293 
2294         assert((lower & upper).empty); // no intersection
2295         auto lowerASCII = lower & ascii;
2296         assert(lowerASCII.byCodepoint.equal(iota('a', 'z'+1)));
2297         // throw away all of the lowercase ASCII
2298         assert((ascii - lower).length == 128 - 26);
2299 
2300         auto onlyOneOf = lower ~ ascii;
2301         assert(!onlyOneOf['Δ']); // not ASCII and not lowercase
2302         assert(onlyOneOf['$']); // ASCII and not lowercase
2303         assert(!onlyOneOf['a']); // ASCII and lowercase
2304         assert(onlyOneOf['я']); // not ASCII but lowercase
2305 
2306         // throw away all cased letters from ASCII
2307         auto noLetters = ascii - (lower | upper);
2308         assert(noLetters.length == 128 - 26*2);
2309     }
2310 
2311     /// The 'op=' versions of the above overloaded operators.
2312     ref This opOpAssign(string op, U)(U rhs)
2313         if (isCodepointSet!U || is(U:dchar))
2314     {
2315         static if (op == "|")    // union
2316         {
2317             static if (is(U:dchar))
2318             {
2319                 this.addInterval(rhs, rhs+1);
2320                 return this;
2321             }
2322             else
2323                 return this.add(rhs);
2324         }
2325         else static if (op == "&")   // intersection
2326                 return this.intersect(rhs);// overloaded
2327         else static if (op == "-")   // set difference
2328                 return this.sub(rhs);// overloaded
2329         else static if (op == "~")   // symmetric set difference
2330         {
2331             auto copy = this & rhs;
2332             this |= rhs;
2333             this -= copy;
2334             return this;
2335         }
2336         else
2337             static assert(0, "no operator "~op~" defined for Set");
2338     }
2339 
2340     /**
2341         Tests the presence of codepoint `ch` in this set,
2342         the same as $(LREF opIndex).
2343     */
2344     bool opBinaryRight(string op: "in", U)(U ch) const
2345         if (is(U : dchar))
2346     {
2347         return this[ch];
2348     }
2349 
2350     ///
2351     pure @safe unittest
2352     {
2353         assert('я' in unicode.Cyrillic);
2354         assert(!('z' in unicode.Cyrillic));
2355     }
2356 
2357 
2358 
2359     /**
2360      * Obtains a set that is the inversion of this set.
2361      *
2362      * See_Also: $(LREF inverted)
2363      */
2364     auto opUnary(string op: "!")()
2365     {
2366         return this.inverted;
2367     }
2368 
2369     /**
2370         A range that spans each $(CODEPOINT) in this set.
2371     */
2372     @property auto byCodepoint()
2373     {
2374         static struct CodepointRange
2375         {
2376             this(This set)
2377             {
2378                 r = set.byInterval;
2379                 if (!r.empty)
2380                     cur = r.front.a;
2381             }
2382 
2383             @property dchar front() const
2384             {
2385                 return cast(dchar) cur;
2386             }
2387 
2388             @property bool empty() const
2389             {
2390                 return r.empty;
2391             }
2392 
2393             void popFront()
2394             {
2395                 cur++;
2396                 while (cur >= r.front.b)
2397                 {
2398                     r.popFront();
2399                     if (r.empty)
2400                         break;
2401                     cur = r.front.a;
2402                 }
2403             }
2404         private:
2405             uint cur;
2406             typeof(This.init.byInterval) r;
2407         }
2408 
2409         return CodepointRange(this);
2410     }
2411 
2412     ///
2413     pure @safe unittest
2414     {
2415         import std.algorithm.comparison : equal;
2416         import std.range : iota;
2417 
2418         auto set = unicode.ASCII;
2419         set.byCodepoint.equal(iota(0, 0x80));
2420     }
2421 
2422     /**
        $(P Obtain textual representation of this set in form of
2424         open-right intervals and feed it to `sink`.
2425         )
2426         $(P Used by various standard formatting facilities such as
2427          $(REF formattedWrite, std,format), $(REF write, std,stdio),
2428          $(REF writef, std,stdio), $(REF to, std,conv) and others.
2429         )
2430         Example:
2431         ---
2432         import std.conv;
2433         assert(unicode.ASCII.to!string == "[0..128$(RPAREN)");
2434         ---
2435     */
2436 
2437     private import std.format.spec : FormatSpec;
2438 
2439     /***************************************
2440      * Obtain a textual representation of this InversionList
2441      * in form of open-right intervals.
2442      *
2443      * The formatting flag is applied individually to each value, for example:
2444      * $(LI $(B %s) and $(B %d) format the intervals as a [low .. high$(RPAREN) range of integrals)
2445      * $(LI $(B %x) formats the intervals as a [low .. high$(RPAREN) range of lowercase hex characters)
2446      * $(LI $(B %X) formats the intervals as a [low .. high$(RPAREN) range of uppercase hex characters)
2447      */
2448     void toString(Writer)(scope Writer sink, scope const ref FormatSpec!char fmt) /* const */
2449     {
2450         import std.format.write : formatValue;
2451         auto range = byInterval;
2452         if (range.empty)
2453             return;
2454 
2455         while (1)
2456         {
2457             auto i = range.front;
2458             range.popFront();
2459 
2460             put(sink, "[");
2461             formatValue(sink, i.a, fmt);
2462             put(sink, "..");
2463             formatValue(sink, i.b, fmt);
2464             put(sink, ")");
2465             if (range.empty) return;
2466             put(sink, " ");
2467         }
2468     }
2469 
2470     ///
2471     pure @safe unittest
2472     {
2473         import std.conv : to;
2474         import std.format : format;
2475         import std.uni : unicode;
2476 
2477         // This was originally using Cyrillic script.
2478         // Unfortunately this is a pretty active range for changes,
2479         // and hence broke in an update.
2480         // Therefore the range Basic latin was used instead as it
2481         // unlikely to ever change.
2482 
2483         assert(unicode.InBasic_latin.to!string == "[0..128)");
2484 
2485         // The specs '%s' and '%d' are equivalent to the to!string call above.
2486         assert(format("%d", unicode.InBasic_latin) == unicode.InBasic_latin.to!string);
2487 
2488         assert(format("%#x", unicode.InBasic_latin) == "[0..0x80)");
2489         assert(format("%#X", unicode.InBasic_latin) == "[0..0X80)");
2490     }
2491 
    // An unsupported format specifier is reported as a FormatException.
    pure @safe unittest
    {
        import std.exception : assertThrown;
        import std.format : format, FormatException;
        assertThrown!FormatException(format("%z", unicode.ASCII));
    }
2498 
2499 
2500     /**
2501         Add an interval [a, b$(RPAREN) to this set.
2502     */
2503     ref add()(uint a, uint b)
2504     {
2505         addInterval(a, b);
2506         return this;
2507     }
2508 
2509     ///
2510     pure @safe unittest
2511     {
2512         CodepointSet someSet;
2513         someSet.add('0', '5').add('A','Z'+1);
2514         someSet.add('5', '9'+1);
2515         assert(someSet['0']);
2516         assert(someSet['5']);
2517         assert(someSet['9']);
2518         assert(someSet['Z']);
2519     }
2520 
2521 private:
2522 
  package(std)  // used from: std.regex.internal.parser
    // In-place intersection with another code point set.
    // Walks the intervals of `rhs` in order: everything in this set before
    // an interval's start is dropped, the part up to its end is skipped
    // over (i.e. kept), and whatever follows the last interval is dropped.
    // Returns a reference to this set.
    ref intersect(U)(U rhs)
        if (isCodepointSet!U)
    {
        Marker mark;
        foreach ( i; rhs.byInterval)
        {
            mark = this.dropUpTo(i.a, mark);
            mark = this.skipUpTo(i.b, mark);
        }
        // nothing of `rhs` is left, so the remainder of this set goes away
        this.dropUpTo(uint.max, mark);
        return this;
    }
2536 
2537     ref intersect()(dchar ch)
2538     {
2539         foreach (i; byInterval)
2540             if (i.a <= ch && ch < i.b)
2541                 return this = This.init.add(ch, ch+1);
2542         this = This.init;
2543         return this;
2544     }
2545 
    // Intersecting with a code point that is not in the set yields the empty set.
    pure @safe unittest
    {
        assert(unicode.Cyrillic.intersect('-').byInterval.empty);
    }
2550 
    // In-place subtraction of a single code point; forwards to `subChar`.
    ref sub()(dchar ch)
    {
        return subChar(ch);
    }
2555 
    // In-place set difference. Same traversal as intersect(U) except that
    // the skip & drop parts are swapped and there is no trailing drop:
    // the part of this set before each interval of `rhs` is skipped (kept),
    // the part up to the interval's end is dropped.
  package(std)  // used from: std.regex.internal.parser
    ref sub(U)(U rhs)
        if (isCodepointSet!U)
    {
        Marker mark;
        foreach (i; rhs.byInterval)
        {
            mark = this.skipUpTo(i.a, mark);
            mark = this.dropUpTo(i.b, mark);
        }
        return this;
    }
2569 
  package(std)  // used from: std.regex.internal.parser
    // In-place union: adds every interval of `rhs` to this set.
    // The marker returned by each insertion is passed as the hint for the
    // next one. Returns a reference to this set.
    ref add(U)(U rhs)
        if (isCodepointSet!U)
    {
        Marker start;
        foreach (i; rhs.byInterval)
        {
            start = addInterval(i.a, i.b, start);
        }
        return this;
    }
2581 
2582 // end of mixin-able part
2583 //============================================================================
2584 public:
2585     /**
2586         Obtains a set that is the inversion of this set.
2587 
2588         See the '!' $(LREF opUnary) for the same but using operators.
2589     */
2590     @property auto inverted()
2591     {
2592         InversionList inversion = this;
2593         if (inversion.data.length == 0)
2594         {
2595             inversion.addInterval(0, lastDchar+1);
2596             return inversion;
2597         }
2598         if (inversion.data[0] != 0)
2599             genericReplace(inversion.data, 0, 0, [0]);
2600         else
2601             genericReplace(inversion.data, 0, 1, cast(uint[]) null);
2602         if (data[data.length-1] != lastDchar+1)
2603             genericReplace(inversion.data,
2604                 inversion.data.length, inversion.data.length, [lastDchar+1]);
2605         else
2606             genericReplace(inversion.data,
2607                 inversion.data.length-1, inversion.data.length, cast(uint[]) null);
2608 
2609         return inversion;
2610     }
2611 
2612     ///
2613     pure @safe unittest
2614     {
2615         auto set = unicode.ASCII;
2616         // union with the inverse gets all of the code points in the Unicode
2617         assert((set | set.inverted).length == 0x110000);
2618         // no intersection with the inverse
2619         assert((set & set.inverted).empty);
2620     }
2621 
    // Generates the D source of a function `bool funcName(dchar ch)` that
    // tests `ch` against the given intervals.
    // `range` is indexed as pairs: element [0] is the inclusive start and
    // element [1] the exclusive end of an interval. The generated tests rely
    // on the intervals being in ascending order.
    // An empty `funcName` is replaced with the word "function".
    package(std) static string toSourceCode(const(CodepointInterval)[] range, string funcName)
    {
        import std.algorithm.searching : countUntil;
        import std.format : format;
        // below this number of intervals a linear sequence of tests is emitted
        enum maxBinary = 3;
        // emits a brace-enclosed block that tests the intervals one by one
        static string linearScope(R)(R ivals, string indent)
        {
            string result = indent~"{\n";
            string deeper = indent~"    ";
            foreach (ival; ivals)
            {
                immutable span = ival[1] - ival[0];
                assert(span != 0);
                if (span == 1)
                {
                    result ~= format("%sif (ch == %s) return true;\n", deeper, ival[0]);
                }
                else if (span == 2)
                {
                    result ~= format("%sif (ch == %s || ch == %s) return true;\n",
                        deeper, ival[0], ival[0]+1);
                }
                else
                {
                    if (ival[0] != 0) // dchar is unsigned, so a "< 0" test is useless
                        result ~= format("%sif (ch < %s) return false;\n", deeper, ival[0]);
                    result ~= format("%sif (ch < %s) return true;\n", deeper, ival[1]);
                }
            }
            result ~= format("%sreturn false;\n%s}\n", deeper, indent); // including empty range of intervals
            return result;
        }

        // picks between the linear block and a further bisection
        static string binaryScope(R)(R ivals, string indent) @safe
        {
            // time to do unrolled comparisons?
            if (ivals.length < maxBinary)
                return linearScope(ivals, indent);
            else
                return bisect(ivals, ivals.length/2, indent);
        }

        // not used yet: if/else binary search is far better with DMD as of 2.061
        // and GDC is doing a fine job either way
        static string switchScope(R)(R ivals, string indent)
        {
            string result = indent~"switch (ch){\n";
            string deeper = indent~"    ";
            foreach (ival; ivals)
            {
                if (ival[0]+1 == ival[1])
                {
                    result ~= format("%scase %s: return true;\n",
                        deeper, ival[0]);
                }
                else
                {
                    result ~= format("%scase %s: .. case %s: return true;\n",
                         deeper, ival[0], ival[1]-1);
                }
            }
            result ~= deeper~"default: return false;\n"~indent~"}\n";
            return result;
        }

        // emits an if / else if / else block that splits `range` around
        // the interval at index `idx`
        static string bisect(R)(R range, size_t idx, string indent)
        {
            string deeper = indent ~ "    ";
            // bisect on one [a, b) interval at idx
            string result = indent~"{\n";
            // less branch, < a
            result ~= format("%sif (ch < %s)\n%s",
                deeper, range[idx][0], binaryScope(range[0 .. idx], deeper));
            // middle point,  >= a && < b
            result ~= format("%selse if (ch < %s) return true;\n",
                deeper, range[idx][1]);
            // greater or equal branch,  >= b
            result ~= format("%selse\n%s",
                deeper, binaryScope(range[idx+1..$], deeper));
            return result~indent~"}\n";
        }

        string code = format("bool %s(dchar ch) @safe pure nothrow @nogc\n",
            funcName.empty ? "function" : funcName);
        // special case first bisection to be on ASCII vs beyond
        // (index of the first interval that starts above 0x80)
        auto tillAscii = countUntil!"a[0] > 0x80"(range);
        if (tillAscii <= 0) // everything is ASCII or nothing is ascii (-1 & 0)
            code ~= binaryScope(range, "");
        else
            code ~= bisect(range, tillAscii, "");
        return code;
    }
2714 
2715     /**
2716         Generates string with D source code of unary function with name of
2717         `funcName` taking a single `dchar` argument. If `funcName` is empty
2718         the code is adjusted to be a lambda function.
2719 
2720         The function generated tests if the $(CODEPOINT) passed
2721         belongs to this set or not. The result is to be used with string mixin.
2722         The intended usage area is aggressive optimization via meta programming
2723         in parser generators and the like.
2724 
2725         Note: Use with care for relatively small or regular sets. It
2726         could end up being slower then just using multi-staged tables.
2727 
2728         Example:
2729         ---
2730         import std.stdio;
2731 
2732         // construct set directly from [a, b$RPAREN intervals
2733         auto set = CodepointSet(10, 12, 45, 65, 100, 200);
2734         writeln(set);
2735         writeln(set.toSourceCode("func"));
2736         ---
2737 
2738         The above outputs something along the lines of:
2739         ---
2740         bool func(dchar ch)  @safe pure nothrow @nogc
2741         {
2742             if (ch < 45)
2743             {
2744                 if (ch == 10 || ch == 11) return true;
2745                 return false;
2746             }
2747             else if (ch < 65) return true;
2748             else
2749             {
2750                 if (ch < 100) return false;
2751                 if (ch < 200) return true;
2752                 return false;
2753             }
2754         }
2755         ---
2756     */
2757     string toSourceCode(string funcName="")
2758     {
2759         import std.array : array;
2760         auto range = byInterval.array();
2761         return toSourceCode(range, funcName);
2762     }
2763 
2764     /**
2765         True if this set doesn't contain any $(CODEPOINTS).
2766     */
2767     @property bool empty() const
2768     {
2769         return data.length == 0;
2770     }
2771 
2772     ///
2773     pure @safe unittest
2774     {
2775         CodepointSet emptySet;
2776         assert(emptySet.length == 0);
2777         assert(emptySet.empty);
2778     }
2779 
2780 private:
    // shorthand for the type of the enclosing set
    alias This = typeof(this);
    // position inside `data`, passed to and returned by
    // addInterval / skipUpTo / dropUpTo
    alias Marker = size_t;
2783 
2784     // a random-access range of integral pairs
2785     static struct Intervals(Range)
2786     {
2787         import std.range.primitives : hasAssignableElements;
2788 
2789         this(Range sp) scope
2790         {
2791             slice = sp;
2792             start = 0;
2793             end = sp.length;
2794         }
2795 
2796         this(Range sp, size_t s, size_t e) scope
2797         {
2798             slice = sp;
2799             start = s;
2800             end = e;
2801         }
2802 
2803         @property auto front()const
2804         {
2805             immutable a = slice[start];
2806             immutable b = slice[start+1];
2807             return CodepointInterval(a, b);
2808         }
2809 
2810         //may break sorted property - but we need std.sort to access it
2811         //hence package(std) protection attribute
2812         static if (hasAssignableElements!Range)
2813         package(std) @property void front(CodepointInterval val)
2814         {
2815             slice[start] = val.a;
2816             slice[start+1] = val.b;
2817         }
2818 
2819         @property auto back()const
2820         {
2821             immutable a = slice[end-2];
2822             immutable b = slice[end-1];
2823             return CodepointInterval(a, b);
2824         }
2825 
2826         //ditto about package
2827         static if (hasAssignableElements!Range)
2828         package(std) @property void back(CodepointInterval val)
2829         {
2830             slice[end-2] = val.a;
2831             slice[end-1] = val.b;
2832         }
2833 
2834         void popFront()
2835         {
2836             start += 2;
2837         }
2838 
2839         void popBack()
2840         {
2841             end -= 2;
2842         }
2843 
2844         auto opIndex(size_t idx) const
2845         {
2846             immutable a = slice[start+idx*2];
2847             immutable b = slice[start+idx*2+1];
2848             return CodepointInterval(a, b);
2849         }
2850 
2851         //ditto about package
2852         static if (hasAssignableElements!Range)
2853         package(std) void opIndexAssign(CodepointInterval val, size_t idx)
2854         {
2855             slice[start+idx*2] = val.a;
2856             slice[start+idx*2+1] = val.b;
2857         }
2858 
2859         auto opSlice(size_t s, size_t e)
2860         {
2861             return Intervals(slice, s*2+start, e*2+start);
2862         }
2863 
2864         @property size_t length()const {  return slice.length/2; }
2865 
2866         @property bool empty()const { return start == end; }
2867 
2868         @property auto save(){ return this; }
2869     private:
2870         size_t start, end;
2871         Range slice;
2872     }
2873 
2874     // called after construction from intervals
2875     // to make sure invariants hold
2876     void sanitize()
2877     {
2878         import std.algorithm.comparison : max;
2879         import std.algorithm.mutation : SwapStrategy;
2880         import std.algorithm.sorting : sort;
2881         if (data.length == 0)
2882             return;
2883         alias Ival = CodepointInterval;
2884         //intervals wrapper for a _range_ over packed array
2885         auto ivals = Intervals!(typeof(data[]))(data[]);
2886         //@@@BUG@@@ can't use "a.a < b.a" see
2887         // https://issues.dlang.org/show_bug.cgi?id=12265
2888         sort!((a,b) => a.a < b.a, SwapStrategy.stable)(ivals);
2889         // what follows is a variation on stable remove
2890         // differences:
2891         // - predicate is binary, and is tested against
2892         //   the last kept element (at 'i').
2893         // - predicate mutates lhs (merges rhs into lhs)
2894         size_t len = ivals.length;
2895         size_t i = 0;
2896         size_t j = 1;
2897         while (j < len)
2898         {
2899             if (ivals[i].b >= ivals[j].a)
2900             {
2901                 ivals[i] = Ival(ivals[i].a, max(ivals[i].b, ivals[j].b));
2902                 j++;
2903             }
2904             else //unmergable
2905             {
2906                 // check if there is a hole after merges
2907                 // (in the best case we do 0 writes to ivals)
2908                 if (j != i+1)
2909                     ivals[i+1] = ivals[j]; //copy over
2910                 i++;
2911                 j++;
2912             }
2913         }
2914         len = i + 1;
2915         for (size_t k=0; k + 1 < len; k++)
2916         {
2917             assert(ivals[k].a < ivals[k].b);
2918             assert(ivals[k].b < ivals[k+1].a);
2919         }
2920         data.length = len * 2;
2921     }
2922 
2923     // special case for normal InversionList
2924     ref subChar(dchar ch)
2925     {
2926         auto mark = skipUpTo(ch);
2927         if (mark != data.length
2928             && data[mark] == ch && data[mark-1] == ch)
2929         {
2930             // it has split, meaning that ch happens to be in one of intervals
2931             data[mark] = data[mark]+1;
2932         }
2933         return this;
2934     }
2935 
2936     //
2937     Marker addInterval(int a, int b, Marker hint=Marker.init) scope
2938     in
2939     {
2940         assert(a <= b);
2941     }
2942     do
2943     {
2944         import std.range : assumeSorted, SearchPolicy;
2945         auto range = assumeSorted(data[]);
2946         size_t pos;
2947         size_t a_idx = hint + range[hint..$].lowerBound!(SearchPolicy.gallop)(a).length;
2948         if (a_idx == range.length)
2949         {
2950             //  [---+++----++++----++++++]
2951             //  [                         a  b]
2952             data.append(a, b);
2953             return data.length-1;
2954         }
2955         size_t b_idx = range[a_idx .. range.length].lowerBound!(SearchPolicy.gallop)(b).length+a_idx;
2956         uint[3] buf = void;
2957         uint to_insert;
2958         debug(std_uni)
2959         {
2960             writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
2961         }
2962         if (b_idx == range.length)
2963         {
2964             //  [-------++++++++----++++++-]
2965             //  [      s     a                 b]
2966             if (a_idx & 1)// a in positive
2967             {
2968                 buf[0] = b;
2969                 to_insert = 1;
2970             }
2971             else// a in negative
2972             {
2973                 buf[0] = a;
2974                 buf[1] = b;
2975                 to_insert = 2;
2976             }
2977             pos = genericReplace(data, a_idx, b_idx, buf[0 .. to_insert]);
2978             return pos - 1;
2979         }
2980 
2981         uint top = data[b_idx];
2982 
2983         debug(std_uni)
2984         {
2985             writefln("a_idx=%d; b_idx=%d;", a_idx, b_idx);
2986             writefln("a=%s; b=%s; top=%s;", a, b, top);
2987         }
2988         if (a_idx & 1)
2989         {// a in positive
2990             if (b_idx & 1)// b in positive
2991             {
2992                 //  [-------++++++++----++++++-]
2993                 //  [       s    a        b    ]
2994                 buf[0] = top;
2995                 to_insert = 1;
2996             }
2997             else // b in negative
2998             {
2999                 //  [-------++++++++----++++++-]
3000                 //  [       s    a   b         ]
3001                 if (top == b)
3002                 {
3003                     assert(b_idx+1 < data.length);
3004                     buf[0] = data[b_idx+1];
3005                     pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 1]);
3006                     return pos - 1;
3007                 }
3008                 buf[0] = b;
3009                 buf[1] = top;
3010                 to_insert = 2;
3011             }
3012         }
3013         else
3014         { // a in negative
3015             if (b_idx & 1) // b in positive
3016             {
3017                 //  [----------+++++----++++++-]
3018                 //  [     a     b              ]
3019                 buf[0] = a;
3020                 buf[1] = top;
3021                 to_insert = 2;
3022             }
3023             else// b in negative
3024             {
3025                 //  [----------+++++----++++++-]
3026                 //  [  a       s      b        ]
3027                 if (top == b)
3028                 {
3029                     assert(b_idx+1 < data.length);
3030                     buf[0] = a;
3031                     buf[1] = data[b_idx+1];
3032                     pos = genericReplace(data, a_idx, b_idx+2, buf[0 .. 2]);
3033                     return pos - 1;
3034                 }
3035                 buf[0] = a;
3036                 buf[1] = b;
3037                 buf[2] = top;
3038                 to_insert = 3;
3039             }
3040         }
3041         pos = genericReplace(data, a_idx, b_idx+1, buf[0 .. to_insert]);
3042         debug(std_uni)
3043         {
3044             writefln("marker idx: %d; length=%d", pos, data[pos], data.length);
3045             writeln("inserting ", buf[0 .. to_insert]);
3046         }
3047         return pos - 1;
3048     }
3049 
3050     //
3051     Marker dropUpTo(uint a, Marker pos=Marker.init)
3052     in
3053     {
3054         assert(pos % 2 == 0); // at start of interval
3055     }
3056     do
3057     {
3058         auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
3059         if (range.empty)
3060             return pos;
3061         size_t idx = pos;
3062         idx += range.lowerBound(a).length;
3063 
3064         debug(std_uni)
3065         {
3066             writeln("dropUpTo full length=", data.length);
3067             writeln(pos,"~~~", idx);
3068         }
3069         if (idx == data.length)
3070             return genericReplace(data, pos, idx, cast(uint[])[]);
3071         if (idx & 1)
3072         {   // a in positive
3073             //[--+++----++++++----+++++++------...]
3074             //      |<---si       s  a  t
3075             genericReplace(data, pos, idx, [a]);
3076         }
3077         else
3078         {   // a in negative
3079             //[--+++----++++++----+++++++-------+++...]
3080             //      |<---si              s  a  t
3081             genericReplace(data, pos, idx, cast(uint[])[]);
3082         }
3083         return pos;
3084     }
3085 
3086     //
3087     Marker skipUpTo(uint a, Marker pos=Marker.init)
3088     out(result)
3089     {
3090         assert(result % 2 == 0);// always start of interval
3091         //(may be  0-width after-split)
3092     }
3093     do
3094     {
3095         assert(data.length % 2 == 0);
3096         auto range = assumeSorted!"a <= b"(data[pos .. data.length]);
3097         size_t idx = pos+range.lowerBound(a).length;
3098 
3099         if (idx >= data.length) // could have Marker point to recently removed stuff
3100             return data.length;
3101 
3102         if (idx & 1)// inside of interval, check for split
3103         {
3104 
3105             immutable top = data[idx];
3106             if (top == a)// no need to split, it's end
3107                 return idx+1;
3108             immutable start = data[idx-1];
3109             if (a == start)
3110                 return idx-1;
3111             // split it up
3112             genericReplace(data, idx, idx+1, [a, a, top]);
3113             return idx+1;        // avoid odd index
3114         }
3115         return idx;
3116     }
3117 
    // interval boundaries stored as consecutive pairs: a0, b0, a1, b1, ...
    CowArray!SP data;
3119 }
3120 
// conversion to string goes through InversionList.toString
pure @safe unittest
{
    import std.conv : to;
    assert(unicode.ASCII.to!string() == "[0..128)");
}
3126 
// pedantic version for ctfe, and aligned-access only architectures:
// assembles the `idx`-th 24-bit value from its 3 bytes one byte at a time,
// in the byte order of the target
@system private uint safeRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    immutable base = idx * 3;
    immutable uint b0 = ptr[base];
    immutable uint b1 = ptr[base+1];
    immutable uint b2 = ptr[base+2];
    version (LittleEndian)
        return b0 | (b1 << 8) | (b2 << 16);
    else
        return (b0 << 16) | (b1 << 8) | b2;
}
3138 
// Byte-by-byte counterpart of safeRead24: stores the low 24 bits of `val`
// as the `idx`-th 3-byte value, in the byte order of the target.
@system private void safeWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    immutable base = idx * 3;
    immutable ubyte low = val & 0xFF;
    immutable ubyte mid = (val >> 8) & 0xFF;
    immutable ubyte high = (val >> 16) & 0xFF;
    version (LittleEndian)
    {
        ptr[base] = low;
        ptr[base+1] = mid;
        ptr[base+2] = high;
    }
    else
    {
        ptr[base] = high;
        ptr[base+1] = mid;
        ptr[base+2] = low;
    }
}
3156 
// unaligned x86-like read/write functions
// Reads the `idx`-th 24-bit value by loading a whole 32-bit word at byte
// offset 3*idx and discarding the byte that is not part of the value.
@system private uint unalignedRead24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    immutable word = *cast(uint*)(ptr + 3*idx);
    version (LittleEndian)
        return word & 0xFF_FFFF;
    else
        return word >> 8;
}
3166 
// Word-sized counterpart of unalignedRead24: stores the low 24 bits of `val`
// as the `idx`-th 3-byte value with a single 32-bit read-modify-write at
// byte offset 3*idx, keeping the one byte that belongs to the neighbour.
@system private void unalignedWrite24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    uint* dest = cast(uint*)(cast(ubyte*) ptr + 3*idx);
    version (LittleEndian)
    {
        // mask `val` so that bits above the 24th cannot leak into the
        // neighbouring byte (the shift does this on big-endian targets and
        // safeWrite24 never writes them either)
        *dest = (val & 0xFF_FFFF) | (*dest & 0xFF00_0000);
    }
    else
        *dest = (val << 8) | (*dest & 0xFF);
}
3176 
// Reads the `idx`-th 24-bit value: the word-sized read is used where
// `hasUnalignedReads` is set, except during CTFE; the byte-wise read is
// used everywhere else.
@system private uint read24(scope const ubyte* ptr, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
    {
        if (!__ctfe)
            return unalignedRead24(ptr, idx);
    }
    return safeRead24(ptr, idx);
}
3184 
// Writes `val` as the `idx`-th 24-bit value: the word-sized write is used
// where `hasUnalignedReads` is set, except during CTFE; the byte-wise write
// is used everywhere else.
@system private void write24(scope ubyte* ptr, uint val, size_t idx) pure nothrow @nogc
{
    static if (hasUnalignedReads)
    {
        if (!__ctfe)
        {
            unalignedWrite24(ptr, val, idx);
            return;
        }
    }
    safeWrite24(ptr, val, idx);
}
3192 
// An array of `uint` with a reference count stored in one extra slot right
// after the payload. Copies share the storage (the postblit only bumps the
// count); mutating accessors duplicate the storage first when it is shared.
// An empty array owns no storage and therefore has no reference count.
// Memory is obtained and released through the policy `SP`.
struct CowArray(SP=GcPolicy)
{
    import std.range.primitives : hasLength;

  @safe:
    // Wraps `arr` by appending the reference count slot to it.
    static auto reuse(uint[] arr)
    {
        CowArray cow;
        cow.data = arr;
        SP.append(cow.data, 1);
        assert(cow.refCount == 1);
        assert(cow.length == arr.length);
        return cow;
    }

    // Constructs from a range of known length.
    this(Range)(Range range)
        if (isInputRange!Range && hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        length = range.length;
        // an empty range leaves `data` without even the ref-count slot,
        // so there is nothing to slice or to copy
        if (!empty)
            copy(range, data[0..$-1]);
    }

    // Constructs from a forward range, walking it once to learn the length.
    this(Range)(Range range)
        if (isForwardRange!Range && !hasLength!Range)
    {
        import std.algorithm.mutation : copy;
        import std.range.primitives : walkLength;
        immutable len = walkLength(range.save);
        length = len;
        // see the other constructor about empty input
        if (!empty)
            copy(range, data[0..$-1]);
    }

    // a copy shares the storage, only the count goes up
    this(this)
    {
        if (!empty)
        {
            refCount = refCount + 1;
        }
    }

    // the last owner releases the storage, others just decrement the count
    ~this()
    {
        if (!empty)
        {
            immutable cnt = refCount;
            if (cnt == 1)
                SP.destroy(data);
            else
                refCount = cnt - 1;
        }
    }

    // no ref-count for empty U24 array
    @property bool empty() const { return data.length == 0; }

    // report one less than actual size
    @property size_t length() const
    {
        return data.length ? data.length - 1 : 0;
    }

    //+ an extra slot for ref-count
    // Resizes the payload to `len` elements. Shared storage is left to the
    // other owners and replaced with a private copy.
    @property void length(size_t len)
    {
        import std.algorithm.comparison : min;
        import std.algorithm.mutation : copy;
        if (len == 0)
        {
            if (!empty)
                freeThisReference();
            return;
        }
        immutable total = len + 1; // including ref-count
        if (empty)
        {
            data = SP.alloc!uint(total);
            refCount = 1;
            return;
        }
        immutable cur_cnt = refCount;
        if (cur_cnt != 1) // have more references to this memory
        {
            refCount = cur_cnt - 1;
            auto new_data = SP.alloc!uint(total);
            // take shrinking into account
            auto to_copy = min(total, data.length) - 1;
            copy(data[0 .. to_copy], new_data[0 .. to_copy]);
            data = new_data; // before setting refCount!
            refCount = 1;
        }
        else // 'this' is the only reference
        {
            // use the realloc (hopefully in-place operation)
            data = SP.realloc(data, total);
            refCount = 1; // setup a ref-count in the new end of the array
        }
    }

    alias opDollar = length;

    uint opIndex()(size_t idx)const
    {
        return data[idx];
    }

    // writing to shared storage duplicates it first
    void opIndexAssign(uint val, size_t idx)
    {
        auto cnt = refCount;
        if (cnt != 1)
            dupThisReference(cnt);
        data[idx] = val;
    }

    // mutable slice: the storage is made unique first, since the caller
    // may write through the result
    auto opSlice(size_t from, size_t to)
    {
        if (!empty)
        {
            auto cnt = refCount;
            if (cnt != 1)
                dupThisReference(cnt);
        }
        return data[from .. to];

    }

    // read-only slice, no duplication needed
    auto opSlice(size_t from, size_t to) const
    {
        return data[from .. to];
    }

    // length slices before the ref count
    auto opSlice()
    {
        return opSlice(0, length);
    }

    // ditto
    auto opSlice() const
    {
        return opSlice(0, length);
    }

    // appends all elements of `range`
    void append(Range)(Range range)
        if (isInputRange!Range && hasLength!Range && is(ElementType!Range : uint))
    {
        // imported locally like in the other members that use `copy`
        import std.algorithm.mutation : copy;
        size_t nl = length + range.length;
        length = nl;
        copy(range, this[nl-range.length .. nl]);
    }

    // appends the given values
    void append()(uint[] val...)
    {
        length = length + val.length;
        // the last slot is the ref-count, the new values go right before it
        data[$-val.length-1 .. $-1] = val[];
    }

    // compares payloads only, the reference counts are ignored
    bool opEquals()(auto const ref CowArray rhs)const
    {
        if (empty ^ rhs.empty)
            return false; // one is empty and the other isn't
        return empty || data[0..$-1] == rhs.data[0..$-1];
    }

private:
    // ref-count is right after the data
    @property uint refCount() const
    {
        return data[$-1];
    }

    @property void refCount(uint cnt)
    {
        data[$-1] = cnt;
    }

    // detaches `this` from the storage, releasing it if nobody else uses it
    void freeThisReference()
    {
        immutable count = refCount;
        if (count != 1) // have more references to this memory
        {
            // dec shared ref-count
            refCount = count - 1;
            data = [];
        }
        else
            SP.destroy(data);
        assert(!data.ptr);
    }

    // replaces shared storage with a private copy of it
    void dupThisReference(uint count)
    in
    {
        assert(!empty && count != 1 && count == refCount);
    }
    do
    {
        import std.algorithm.mutation : copy;
        // dec shared ref-count
        refCount = count - 1;
        // copy to the new chunk of RAM
        auto new_data = SP.alloc!uint(data.length);
        // bit-blit old stuff except the counter
        copy(data[0..$-1], new_data[0..$-1]);
        data = new_data; // before setting refCount!
        refCount = 1; // so that this updates the right one
    }

    // payload followed by one slot holding the reference count
    uint[] data;
}
3405 
pure @safe unittest// CowArray tests (aliased as U24A below)
{
    import std.algorithm.comparison : equal;
    import std.algorithm.mutation : copy;
    import std.conv : text;
    import std.range : iota, chain;
    import std.range.primitives : isBidirectionalRange, isOutputRange;
    // Mutates `u24` through a reference while a copy (`u24_c`) is alive;
    // the copy must keep the value it had when it was taken.
    void funcRef(T)(ref T u24)
    {
        u24.length = 2;
        u24[1] = 1024;
        T u24_c = u24;
        assert(u24[1] == 1024);
        u24.length = 0;
        assert(u24.empty);
        // both append overloads: a range and a single variadic value
        u24.append([1, 2]);
        assert(equal(u24[], [1, 2]));
        u24.append(111);
        assert(equal(u24[], [1, 2, 111]));
        // the copy is unaffected by the changes made to the original
        assert(!u24_c.empty && u24_c[1] == 1024);
        u24.length = 3;
        copy(iota(0, 3), u24[]);
        assert(equal(u24[], iota(0, 3)));
        assert(u24_c[1] == 1024);
    }

    // Takes the array by value, copies and assigns it around and checks
    // that modifying one copy (via funcRef) leaves the others intact.
    void func2(T)(T u24)
    {
        T u24_2 = u24;
        T u24_3;
        u24_3 = u24_2;
        assert(u24_2 == u24_3);
        assert(equal(u24[], u24_2[]));
        assert(equal(u24_2[], u24_3[]));
        funcRef(u24_3);

        // only u24_3 was changed by funcRef
        assert(equal(u24_3[], iota(0, 3)));
        assert(!equal(u24_2[], u24_3[]));
        assert(equal(u24_2[], u24[]));
        u24_2 = u24_3;
        assert(equal(u24_2[], iota(0, 3)));
        // to test that passed arg is intact outside
        // plus try out opEquals
        u24 = u24_3;
        u24 = T.init;
        u24_3 = T.init;
        assert(u24.empty);
        assert(u24 == u24_3);
        assert(u24 != u24_2);
    }

    // run the same checks for every storage policy
    static foreach (Policy; AliasSeq!(GcPolicy, ReallocPolicy))
    {{
        alias Range = typeof(CowArray!Policy.init[]);
        alias U24A = CowArray!Policy;
        // the slice type must satisfy the range primitives used below
        static assert(isForwardRange!Range);
        static assert(isBidirectionalRange!Range);
        static assert(isOutputRange!(Range, uint));
        static assert(isRandomAccessRange!(Range));

        auto arr = U24A([42u, 36, 100]);
        assert(arr[0] == 42);
        assert(arr[1] == 36);
        arr[0] = 72;
        arr[1] = 0xFE_FEFE;
        assert(arr[0] == 72);
        assert(arr[1] == 0xFE_FEFE);
        assert(arr[2] == 100);
        U24A arr2 = arr;
        assert(arr2[0] == 72);
        arr2[0] = 11;
        // test COW-ness
        assert(arr[0] == 72);
        assert(arr2[0] == 11);
        // set this to about 100M to stress-test COW memory management
        foreach (v; 0 .. 10_000)
            func2(arr);
        // func2 takes its argument by value, so `arr` must be unchanged
        assert(equal(arr[], [72, 0xFE_FEFE, 100]));

        // construction from a range and writing through a partial slice
        auto r2 = U24A(iota(0, 100));
        assert(equal(r2[], iota(0, 100)), text(r2[]));
        copy(iota(10, 170, 2), r2[10 .. 90]);
        assert(equal(r2[], chain(iota(0, 10), iota(10, 170, 2), iota(90, 100)))
               , text(r2[]));
    }}
}
3492 
pure @safe unittest// core set primitives test
{
    import std.conv : text;
    alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
    // every check below runs for each storage policy
    foreach (CodeList; AllSets)
    {
        CodeList a;
        //"plug a hole" test
        a.add(10, 20).add(25, 30).add(15, 27);
        assert(a == CodeList(10, 30), text(a));

        // x is the fixture; `a = x` resets `a` before each case
        auto x = CodeList.init;
        x.add(10, 20).add(30, 40).add(50, 60);

        a = x;
        a.add(20, 49);//[10, 49) [50, 60)
        assert(a == CodeList(10, 49, 50 ,60));

        a = x;
        a.add(20, 50);
        assert(a == CodeList(10, 60), text(a));

        // simple unions, mostly edge effects
        x = CodeList.init;
        x.add(10, 20).add(40, 60);

        a = x;
        a.add(10, 25); //[10, 25) [40, 60)
        assert(a == CodeList(10, 25, 40, 60));

        a = x;
        a.add(5, 15); //[5, 20) [40, 60)
        assert(a == CodeList(5, 20, 40, 60));

        a = x;
        a.add(0, 10); // [0, 20) [40, 60)
        assert(a == CodeList(0, 20, 40, 60));

        a = x;
        a.add(0, 5); // prepend
        assert(a == CodeList(0, 5, 10, 20, 40, 60), text(a));

        a = x;
        a.add(5, 20);
        assert(a == CodeList(5, 20, 40, 60));

        a = x;
        a.add(3, 37);
        assert(a == CodeList(3, 37, 40, 60));

        a = x;
        a.add(37, 65);
        assert(a == CodeList(10, 20, 37, 65));

        // some tests on helpers for set intersection
        x = CodeList.init.add(10, 20).add(40, 60).add(100, 120);
        a = x;

        // expected: everything below 60 is kept, [60, 110) is removed
        auto m = a.skipUpTo(60);
        a.dropUpTo(110, m);
        assert(a == CodeList(10, 20, 40, 60, 110, 120), text(a.data[]));

        // expected: without a marker everything below 100 is removed
        a = x;
        a.dropUpTo(100);
        assert(a == CodeList(100, 120), text(a.data[]));

        // expected: [40, 60) is cut at 50, then [50, 140) is removed
        a = x;
        m = a.skipUpTo(50);
        a.dropUpTo(140, m);
        assert(a == CodeList(10, 20, 40, 50), text(a.data[]));
        a = x;
        a.dropUpTo(60);
        assert(a == CodeList(100, 120), text(a.data[]));
    }
}
3568 
3569 
3570 //test constructor to work with any order of intervals
pure @safe unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text, to;
    import std.range : chain, iota;
    import std.typecons : tuple;
    //ensure constructor handles bad ordering and overlap
    auto c1 = CodepointSet('а', 'я'+1, 'А','Я'+1);
    foreach (ch; chain(iota('а', 'я'+1), iota('А','Я'+1)))
        assert(ch in c1, to!string(ch));

    //contiguous
    assert(CodepointSet(1000, 1006, 1006, 1009)
        .byInterval.equal([tuple(1000, 1009)]));
    //contains
    assert(CodepointSet(900, 1200, 1000, 1100)
        .byInterval.equal([tuple(900, 1200)]));
    //intersect left
    assert(CodepointSet(900, 1100, 1000, 1200)
        .byInterval.equal([tuple(900, 1200)]));
    //intersect right
    assert(CodepointSet(1000, 1200, 900, 1100)
        .byInterval.equal([tuple(900, 1200)]));

    //ditto with extra items at end
    assert(CodepointSet(1000, 1200, 900, 1100, 800, 850)
        .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));
    assert(CodepointSet(900, 1100, 1000, 1200, 800, 850)
        .byInterval.equal([tuple(800, 850), tuple(900, 1200)]));

    //"plug a hole" test
    // the second row of intervals fills every gap left by the first one
    auto c2 = CodepointSet(20, 40,
        60, 80, 100, 140, 150, 200,
        40, 60, 80, 100, 140, 150
    );
    assert(c2.byInterval.equal([tuple(20, 200)]));

    // mixed case: overlapping, adjacent and out-of-order intervals at once
    auto c3 = CodepointSet(
        20, 40, 60, 80, 100, 140, 150, 200,
        0, 10, 15, 100, 10, 20, 200, 220);
    assert(c3.byInterval.equal([tuple(0, 140), tuple(150, 220)]));
}
3613 
3614 
pure @safe unittest
{   // full set operations
    import std.conv : text;
    alias AllSets = AliasSeq!(InversionList!GcPolicy, InversionList!ReallocPolicy);
    // every check below runs for each storage policy
    foreach (CodeList; AllSets)
    {
        CodeList a, b, c, d;

        // --- union (|), checked in both operand orders ---
        //"plug a hole"
        a.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b.add(40, 60).add(80, 100).add(140, 150);
        c = a | b;
        d = b | a;
        assert(c == CodeList(20, 200), text(CodeList.stringof," ", c));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(25, 45).add(65, 85).add(95,110).add(150, 210);
        c = a | b; //[20,45) [60, 85) [95, 140) [150, 210)
        d = b | a;
        assert(c == CodeList(20, 45, 60, 85, 95, 140, 150, 210), text(c));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(10, 20).add(30,100).add(145,200);
        c = a | b;//[10, 140) [145, 200)
        d = b | a;
        assert(c == CodeList(10, 140, 145, 200));
        assert(c == d, text(c," vs ", d));

        b = CodeList.init.add(0, 10).add(15, 100).add(10, 20).add(200, 220);
        c = a | b;//[0, 140) [150, 220)
        d = b | a;
        assert(c == CodeList(0, 140, 150, 220));
        assert(c == d, text(c," vs ", d));


        // --- intersection (&), checked in both operand orders ---
        a = CodeList.init.add(20, 40).add(60, 80);
        b = CodeList.init.add(25, 35).add(65, 75);
        c = a & b;
        d = b & a;
        assert(c == CodeList(25, 35, 65, 75), text(c));
        assert(c == d, text(c," vs ", d));

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(25, 35).add(65, 75).add(110, 130).add(160, 180);
        c = a & b;
        d = b & a;
        assert(c == CodeList(25, 35, 65, 75, 110, 130, 160, 180), text(c));
        assert(c == d, text(c," vs ", d));

        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(10, 30).add(60, 120).add(135, 160);
        c = a & b;//[20, 30)[60, 80) [100, 120) [135, 140) [150, 160)
        d = b & a;

        assert(c == CodeList(20, 30, 60, 80, 100, 120, 135, 140, 150, 160),text(c));
        assert(c == d, text(c, " vs ",d));
        // intersecting a result with either of its operands changes nothing
        assert((c & a) == c);
        assert((d & b) == d);
        assert((c & d) == d);

        b = CodeList.init.add(40, 60).add(80, 100).add(140, 200);
        c = a & b;
        d = b & a;
        assert(c == CodeList(150, 200), text(c));
        assert(c == d, text(c, " vs ",d));
        assert((c & a) == c);
        assert((d & b) == d);
        assert((c & d) == d);

        // a set intersected with itself is unchanged
        assert((a & a) == a);
        assert((b & b) == b);

        // --- difference (-) ---
        a = CodeList.init.add(20, 40).add(60, 80).add(100, 140).add(150, 200);
        b = CodeList.init.add(30, 60).add(75, 120).add(190, 300);
        c = a - b;// [20, 30) [60, 75) [120, 140) [150, 190)
        d = b - a;// [40, 60) [80, 100) [200, 300)
        assert(c == CodeList(20, 30, 60, 75, 120, 140, 150, 190), text(c));
        assert(d == CodeList(40, 60, 80, 100, 200, 300), text(d));
        // c and d are disjoint, so subtracting one from the other is a no-op
        assert(c - d == c, text(c-d, " vs ", c));
        assert(d - c == d, text(d-c, " vs ", d));
        assert(c - c == CodeList.init);
        assert(d - d == CodeList.init);

        a = CodeList.init.add(20, 40).add( 60, 80).add(100, 140).add(150,            200);
        b = CodeList.init.add(10,  50).add(60,                           160).add(190, 300);
        c = a - b;// [160, 190)
        d = b - a;// [10, 20) [40, 50) [80, 100) [140, 150) [200, 300)
        assert(c == CodeList(160, 190), text(c));
        assert(d == CodeList(10, 20, 40, 50, 80, 100, 140, 150, 200, 300), text(d));
        assert(c - d == c, text(c-d, " vs ", c));
        assert(d - c == d, text(d-c, " vs ", d));
        assert(c - c == CodeList.init);
        assert(d - d == CodeList.init);

        // --- symmetric difference (~), checked in both operand orders ---
        a = CodeList.init.add(20,    40).add(60, 80).add(100,      140).add(150,  200);
        b = CodeList.init.add(10, 30).add(45,         100).add(130,             190);
        c = a ~ b; // [10, 20) [30, 40) [45, 60) [80, 130) [140, 150) [190, 200)
        d = b ~ a;
        assert(c == CodeList(10, 20, 30, 40, 45, 60, 80, 130, 140, 150, 190, 200),
               text(c));
        assert(c == d, text(c, " vs ", d));
    }
}
3718 
3719 }
3720 
pure @safe unittest// vs single dchar
{
    import std.conv : text;
    auto set = CodepointSet(10, 100, 120, 200);
    // removing 'A' (65) punches a one-code-point hole into [10, 100)
    auto withoutA = set - 'A';
    assert(withoutA == CodepointSet(10, 65, 66, 100, 120, 200), text(withoutA));
    // intersecting with 'B' (66) leaves only that code point
    auto onlyB = set & 'B';
    assert(onlyB == CodepointSet(66, 67));
}
3728 
pure @safe unittest// iteration & opIndex
{
    import std.algorithm.comparison : equal;
    import std.conv : text;
    import std.typecons : tuple, Tuple;

    static foreach (CodeList; AliasSeq!(InversionList!(ReallocPolicy)))
    {{
        // 'A' .. 'M' and 'a' .. 'm': the code points covered by the set below
        auto arr = "ABCDEFGHIJKLMabcdefghijklm"d;
        auto a = CodeList('A','N','a', 'n');
        assert(equal(a.byInterval,
                [tuple(cast(uint)'A', cast(uint)'N'), tuple(cast(uint)'a', cast(uint)'n')]
            ), text(a.byInterval));

        // same @@@BUG as in https://issues.dlang.org/show_bug.cgi?id=8949 ?
        // (reverse iteration is only checked when version bug8949 is set)
        version (bug8949)
        {
            import std.range : retro;
            assert(equal(retro(a.byInterval),
                [tuple(cast(uint)'a', cast(uint)'n'), tuple(cast(uint)'A', cast(uint)'N')]
            ), text(retro(a.byInterval)));
        }
        // per-code-point iteration must agree with membership via opIndex
        auto achr = a.byCodepoint;
        assert(equal(achr, arr), text(a.byCodepoint));
        foreach (ch; a.byCodepoint)
            assert(a[ch]);
        auto x = CodeList(100, 500, 600, 900, 1200, 1500);
        assert(equal(x.byInterval, [ tuple(100, 500), tuple(600, 900), tuple(1200, 1500)]), text(x.byInterval));
        foreach (ch; x.byCodepoint)
            assert(x[ch]);
        // round-trip through byInterval, only compiled when CodeList is CodepointSet
        static if (is(CodeList == CodepointSet))
        {
            auto y = CodeList(x.byInterval);
            assert(equal(x.byInterval, y.byInterval));
        }
        // an empty set yields empty ranges
        assert(equal(CodepointSet.init.byInterval, cast(Tuple!(uint, uint)[])[]));
        assert(equal(CodepointSet.init.byCodepoint, cast(dchar[])[]));
    }}
}
3768 
3769 //============================================================================
3770 // Generic Trie template and various ways to build it
3771 //============================================================================
3772 
3773 // debug helper to get a shortened array dump
auto arrayRepr(T)(T x)
{
    import std.conv : text;
    immutable len = x.length;
    // short enough to be printed in full
    if (len <= 32)
        return text(x);
    // otherwise keep the first and the last 16 elements only
    return text(x[0 .. 16],"~...~", x[len-16 .. len]);
}
3784 
3785 /**
3786     Maps `Key` to a suitable integer index within the range of `size_t`.
3787     The mapping is constructed by applying predicates from `Prefix` left to right
3788     and concatenating the resulting bits.
3789 
3790     The first (leftmost) predicate defines the most significant bits of
3791     the resulting index.
3792  */
template mapTrieIndex(Prefix...)
{
    size_t mapTrieIndex(Key)(Key key)
        if (isValidPrefixForTrie!(Key, Prefix))
    {
        size_t index;
        // every predicate but the last: merge its bits in, then make room
        // for the bits of the predicate that follows
        foreach (level, pred; Prefix[0 .. $-1])
        {
            index |= Prefix[level](key);
            index <<= Prefix[level+1].bitSize;
        }
        // the last predicate supplies the least significant bits
        index |= Prefix[$-1](key);
        return index;
    }
}
3809 
3810 /*
3811     `TrieBuilder` is a type used for incremental construction
3812     of $(LREF Trie)s.
3813 
3814     See $(LREF buildTrie) for generic helpers built on top of it.
3815 */
@trusted private struct TrieBuilder(Value, Key, Args...)
if (isBitPackableType!Value && isValidArgsForTrie!(Key, Args))
{
    import std.exception : enforce;

private:
    // last index is not stored in table, it is used as an offset to values in a block.
    static if (is(Value == bool))// always pack bool
        alias V = BitPacked!(Value, 1);
    else
        alias V = Value;
    // number of distinct indices the predicates can produce:
    // the product of 2^^bitSize over all of them
    static auto deduceMaxIndex(Preds...)()
    {
        size_t idx = 1;
        foreach (v; Preds)
            idx *= 2^^v.bitSize;
        return idx;
    }

    static if (is(typeof(Args[0]) : Key)) // Args start with upper bound on Key
    {
        alias Prefix = Args[1..$];
        enum lastPageSize = 2^^Prefix[$-1].bitSize;
        enum translatedMaxIndex = mapTrieIndex!(Prefix)(Args[0]);
        // upper bound rounded up to a whole number of last-level pages
        enum roughedMaxIndex =
            (translatedMaxIndex + lastPageSize-1)/lastPageSize*lastPageSize;
        // check wrap around - if wrapped, use the default deduction rule
        enum maxIndex = roughedMaxIndex < translatedMaxIndex ?
            deduceMaxIndex!(Prefix)() : roughedMaxIndex;
    }
    else
    {
        alias Prefix = Args;
        enum maxIndex = deduceMaxIndex!(Prefix)();
    }

    // maps a Key to its full-width index
    alias getIndex = mapTrieIndex!(Prefix);

    enum lastLevel = Prefix.length-1;
    struct ConstructState
    {
        // size_t.max means "no such page seen yet" (see the constructor)
        size_t idx_zeros, idx_ones;
    }
    // iteration over levels of Trie, each indexes its own level and thus a shortened domain
    size_t[Prefix.length] indices;
    // default filler value to use
    Value defValue;
    // this is a full-width index of next item
    size_t curIndex;
    // all-zeros page index, all-ones page index (+ indicator if there is such a page)
    ConstructState[Prefix.length] state;
    // the table being constructed
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), V) table;

    @disable this();

    //shortcut for index variable at level 'level'
    @property ref idx(size_t level)(){ return indices[level]; }

    // this function assumes no holes in the input so
    // indices are going one by one
    // Writes `numVals` copies of `val` at the current position of `level`,
    // spilling to new pages (and updating the previous level) as needed.
    void addValue(size_t level, T)(T val, size_t numVals)
    {
        alias j = idx!level;
        enum pageSize = 1 << Prefix[level].bitSize;
        if (numVals == 0)
            return;
        auto ptr = table.slice!(level);
        if (numVals == 1)
        {
            static if (level == Prefix.length-1)
                ptr[j] = val;
            else
            {// can incur narrowing conversion
                assert(j < ptr.length);
                ptr[j] = force!(typeof(ptr[j]))(val);
            }
            j++;
            // the page has just been filled up
            if (j % pageSize == 0)
                spillToNextPage!level(ptr);
            return;
        }
        // longer row of values
        // get to the next page boundary
        immutable nextPB = (j + pageSize) & ~(pageSize-1);
        immutable n =  nextPB - j;// can fill right in this page
        if (numVals < n) //fits in current page
        {
            ptr[j .. j+numVals]  = val;
            j += numVals;
            return;
        }
        static if (level != 0)//on the first level it always fits
        {
            numVals -= n;
            //write till the end of current page
            ptr[j .. j+n]  = val;
            j += n;
            //spill to the next page
            spillToNextPage!level(ptr);
            // page at once loop
            // a known all-zeros page exists and val is T.init: instead of
            // writing whole pages, add that page's index to the previous level
            if (state[level].idx_zeros != size_t.max && val == T.init)
            {
                alias NextIdx = typeof(table.slice!(level-1)[0]);
                addValue!(level-1)(force!NextIdx(state[level].idx_zeros),
                    numVals/pageSize);
                ptr = table.slice!level; //table structure might have changed
                numVals %= pageSize;
            }
            else
            {
                while (numVals >= pageSize)
                {
                    numVals -= pageSize;
                    ptr[j .. j+pageSize]  = val;
                    j += pageSize;
                    spillToNextPage!level(ptr);
                }
            }
            if (numVals)
            {
                // the leftovers, an incomplete page
                ptr[j .. j+numVals]  = val;
                j += numVals;
            }
        }
    }

    // Called when the current page of `level` is complete.
    void spillToNextPage(size_t level, Slice)(ref Slice ptr)
    {
        // last level (i.e. topmost) has 1 "page"
        // thus it need not to add a new page on upper level
        static if (level != 0)
            spillToNextPageImpl!(level)(ptr);
    }

    // this can re-use the current page if duplicate or allocate a new one
    // it also makes sure that previous levels point to the correct page in this level
    void spillToNextPageImpl(size_t level, Slice)(ref Slice ptr)
    {
        alias NextIdx = typeof(table.slice!(level-1)[0]);
        NextIdx next_lvl_index;
        enum pageSize = 1 << Prefix[level].bitSize;
        assert(idx!level % pageSize == 0);
        // start of the page that has just been completed
        immutable last = idx!level-pageSize;
        const slice = ptr[idx!level - pageSize .. idx!level];
        size_t j;
        // look for an earlier page with identical contents
        for (j=0; j<last; j+=pageSize)
        {
            if (ptr[j .. j+pageSize] == slice)
            {
                // get index to it, reuse ptr space for the next block
                next_lvl_index = force!NextIdx(j/pageSize);
                version (none)
                {
                import std.stdio : writefln, writeln;
                writefln("LEVEL(%s) page mapped idx: %s: 0..%s  ---> [%s..%s]"
                        ,level
                        ,indices[level-1], pageSize, j, j+pageSize);
                writeln("LEVEL(", level
                        , ") mapped page is: ", slice, ": ", arrayRepr(ptr[j .. j+pageSize]));
                writeln("LEVEL(", level
                        , ") src page is :", ptr, ": ", arrayRepr(slice[0 .. pageSize]));
                }
                idx!level -= pageSize; // reuse this page, it is duplicate
                break;
            }
        }
        // no duplicate found: keep the completed page and add room for the next one
        if (j == last)
        {
    L_allocate_page:
            next_lvl_index = force!NextIdx(idx!level/pageSize - 1);
            // remember the first all-zeros page of this level
            if (state[level].idx_zeros == size_t.max && ptr.zeros(j, j+pageSize))
            {
                state[level].idx_zeros = next_lvl_index;
            }
            // allocate next page
            version (none)
            {
            import std.stdio : writefln;
            writefln("LEVEL(%s) page allocated: %s"
                     , level, arrayRepr(slice[0 .. pageSize]));
            writefln("LEVEL(%s) index: %s ; page at this index %s"
                     , level
                     , next_lvl_index
                     , arrayRepr(
                         table.slice!(level)
                          [pageSize*next_lvl_index..(next_lvl_index+1)*pageSize]
                        ));
            }
            table.length!level = table.length!level + pageSize;
        }
    L_know_index:
        // for the previous level, values are indices to the pages in the current level
        addValue!(level-1)(next_lvl_index, 1);
        ptr = table.slice!level; //re-load the slice after moves
    }

    // idx - full-width index to fill with v (full-width index != key)
    // fills everything in the range of [curIndex, idx) with filler
    void putAt(size_t idx, Value v)
    {
        assert(idx >= curIndex);
        immutable numFillers = idx - curIndex;
        addValue!lastLevel(defValue, numFillers);
        addValue!lastLevel(v, 1);
        curIndex = idx + 1;
    }

    // ditto, but sets the range of [idxA, idxB) to v
    void putRangeAt(size_t idxA, size_t idxB, Value v)
    {
        assert(idxA >= curIndex);
        assert(idxB >= idxA);
        size_t numFillers = idxA - curIndex;
        addValue!lastLevel(defValue, numFillers);
        addValue!lastLevel(v, idxB - idxA);
        curIndex = idxB; // open-right
    }

    // message for the enforce checks in putRange/putValue
    enum errMsg = "non-monotonic prefix function(s), an unsorted range or "~
        "duplicate key->value mapping";

public:
    /**
        Construct a builder, where `filler` is a value
        to indicate empty slots (or "not found" condition).
    */
    this(Value filler)
    {
        curIndex = 0;
        defValue = filler;
        // zeros-page index, ones-page index
        foreach (ref v; state)
            v = ConstructState(size_t.max, size_t.max);
        table = typeof(table)(indices);
        // one page per level is a bootstrap minimum
        foreach (i, Pred; Prefix)
            table.length!i = (1 << Pred.bitSize);
    }

    /**
        Put a value `v` into interval as
        mapped by keys from `a` to `b`.
        All slots prior to `a` are filled with
        the default filler.
    */
    void putRange(Key a, Key b, Value v)
    {
        auto idxA = getIndex(a), idxB = getIndex(b);
        // indexes of key should always grow
        enforce(idxB >= idxA && idxA >= curIndex, errMsg);
        putRangeAt(idxA, idxB, v);
    }

    /**
        Put a value `v` into slot mapped by `key`.
        All slots prior to `key` are filled with the
        default filler.
    */
    void putValue(Key key, Value v)
    {
        auto idx = getIndex(key);
        enforce(idx >= curIndex, errMsg);
        putAt(idx, v);
    }

    /// Finishes construction of Trie, yielding an immutable Trie instance.
    auto build()
    {
        static if (maxIndex != 0) // doesn't cover full range of size_t
        {
            assert(curIndex <= maxIndex);
            // pad the remaining slots up to maxIndex with the filler
            addValue!lastLevel(defValue, maxIndex - curIndex);
        }
        else
        {
            if (curIndex != 0 // couldn't wrap around
                || (Prefix.length != 1 && indices[lastLevel] == 0)) // can be just empty
            {
                // size_t.max - curIndex fillers plus one more, added in two steps
                addValue!lastLevel(defValue, size_t.max - curIndex);
                addValue!lastLevel(defValue, 1);
            }
            // else curIndex already completed the full range of size_t by wrapping around
        }
        return Trie!(V, Key, maxIndex, Prefix)(table);
    }
}
4104 
4105 /**
4106     $(P A generic Trie data-structure for a fixed number of stages.
4107     The design goal is optimal speed with smallest footprint size.
4108     )
4109     $(P It's intentionally read-only and doesn't provide constructors.
4110      To construct one use a special builder,
4111      see $(LREF TrieBuilder) and $(LREF buildTrie).
4112     )
4113 
4114 */
@trusted private struct Trie(Value, Key, Args...)
if (isValidPrefixForTrie!(Key, Args)
    || (isValidPrefixForTrie!(Key, Args[1..$])
    && is(typeof(Args[0]) : size_t)))
{
    import std.range.primitives : isOutputRange;
    // Args optionally start with the maximum index, the rest are the
    // per-level predicates
    static if (is(typeof(Args[0]) : size_t))
    {
        private enum maxIndex = Args[0];
        private enum hasBoundsCheck = true;
        private alias Prefix = Args[1..$];
    }
    else
    {
        private enum hasBoundsCheck = false;
        private alias Prefix = Args;
    }

    // wraps an already built table
    private this()(typeof(_table) table)
    {
        _table = table;
    }

    // only for constant Tries constructed from precompiled tables
    private this()(const(size_t)[] offsets, const(size_t)[] sizes,
        const(size_t)[] data) const
    {
        _table = typeof(_table)(offsets, sizes, data);
    }

    /**
        $(P Lookup the `key` in this `Trie`. )

        $(P The lookup always succeeds if key fits the domain
        provided during construction. The whole domain defined
        is covered so instead of not found condition
        the sentinel (filler) value could be used. )

        $(P See $(LREF buildTrie), $(LREF TrieBuilder) for how to
        define a domain of `Trie` keys and the sentinel value. )

        Note:
        Domain range-checking is only enabled in debug builds
        and results in assertion failure.
    */
    TypeOfBitPacked!Value opIndex()(Key key) const
    {
        static if (hasBoundsCheck)
            assert(mapTrieIndex!Prefix(key) < maxIndex);
        size_t idx;
        alias p = Prefix;
        // the first predicate indexes level 0 directly
        idx = cast(size_t) p[0](key);
        // the entry found at level i is a page number on level i+1;
        // the next predicate supplies the offset within that page
        foreach (i, v; p[0..$-1])
            idx = cast(size_t)((_table.ptr!i[idx]<<p[i+1].bitSize) + p[i+1](key));
        return _table.ptr!(p.length-1)[idx];
    }

    ///
    @property size_t bytes(size_t n=size_t.max)() const
    {
        return _table.bytes!n;
    }

    ///
    @property size_t pages(size_t n)() const
    {
        // bytes of level n divided by the page size, rounded to nearest
        // NOTE(review): page size here is the entry count 2^^bitSize, not a
        // byte size - confirm the intended units
        return (bytes!n+2^^(Prefix[n].bitSize-1))
                /2^^Prefix[n].bitSize;
    }

    ///
    void store(OutRange)(scope OutRange sink) const
        if (isOutputRange!(OutRange, char))
    {
        _table.store(sink);
    }

private:
    // the multi-level table behind the Trie, one level per predicate
    MultiArray!(idxTypes!(Key, fullBitSize!(Prefix), Prefix[0..$]), Value) _table;
}
4195 
4196 // create a tuple of 'sliceBits' that slice the 'top' of bits into pieces of sizes 'sizes'
4197 // left-to-right, the most significant bits first
template GetBitSlicing(size_t top, sizes...)
{
    static if (sizes.length == 0)
        alias GetBitSlicing = AliasSeq!(); // nothing left to slice
    else
    {
        // this level takes bits [top - sizes[0], top); the remaining sizes
        // are sliced recursively below that boundary
        alias GetBitSlicing = AliasSeq!(
            sliceBits!(top - sizes[0], top),
            GetBitSlicing!(top - sizes[0], sizes[1..$])
        );
    }
}
4207 
// callableWith!T yields a predicate template: true for a `Pred` that can be
// called with a `T` and whose result is (or unpacks to) a bit-packable type
template callableWith(T)
{
    template callableWith(alias Pred)
    {
        static if (is(typeof(Pred(T.init))))
        {
            alias Result = typeof(Pred(T.init));
            enum callableWith = isBitPackableType!(TypeOfBitPacked!(Result));
        }
        else
            enum callableWith = false; // not callable with a T at all
    }
}
4221 
4222 /*
4223     Check if `Prefix` is a valid set of predicates
4224     for `Trie` template having `Key` as the type of keys.
4225     This requires all predicates to be callable, take
4226     single argument of type `Key` and return unsigned value.
4227 */
template isValidPrefixForTrie(Key, Prefix...)
{
    import std.meta : allSatisfy;
    // each predicate is checked against Key individually
    alias acceptsKey = callableWith!Key;
    enum isValidPrefixForTrie = allSatisfy!(acceptsKey, Prefix); // TODO: tighten the screws
}
4233 
/*
    Check if `Args` is a set of maximum key value followed by valid predicates
    for `Trie` template having `Key` as the type of keys.

    Accepted shapes:
    - two or more arguments: either all of them are valid predicates, or the
      first one is a value convertible to `Key` (the upper bound) and the
      rest are valid predicates;
    - exactly one argument: it must be a valid predicate for `Key`;
    - no arguments: never valid.
*/
template isValidArgsForTrie(Key, Args...)
{
    static if (Args.length > 1)
    {
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args)
            || (isValidPrefixForTrie!(Key, Args[1..$]) && is(typeof(Args[0]) : Key));
    }
    else static if (Args.length == 1)
    {
        // `Key` must be passed explicitly: without it the lone predicate
        // would be bound as the key type and nothing would be checked.
        enum isValidArgsForTrie = isValidPrefixForTrie!(Key, Args);
    }
    else
        enum isValidArgsForTrie = false; // a trie needs at least one level
}
4248 
// Sum of the compile-time integer sequence `ints`; 0 for an empty sequence.
@property size_t sumOfIntegerTuple(ints...)()
{
    size_t total = 0;
    static foreach (term; ints)
    {
        total += term;
    }
    return total;
}
4256 
/**
    A shorthand for creating a custom multi-level fixed Trie
    from a `CodepointSet`. `sizes` are numbers of bits per level,
    with the most significant bits used first.

    Note: The sum of `sizes` must be equal to 21.

    See_Also: $(LREF toTrie), which is even simpler.

    Example:
    ---
    {
        import std.stdio;
        auto set = unicode("Number");
        auto trie = codepointSetTrie!(8, 5, 8)(set);
        writeln("Input code points to test:");
        foreach (line; stdin.byLine)
        {
            int count=0;
            foreach (dchar ch; line)
                if (trie[ch])// is number
                    count++;
            writefln("Contains %d number code points.", count);
        }
    }
    ---
*/
public template codepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    auto codepointSetTrie(Set)(Set set)
    if (isCodepointSet!Set)
    {
        // one index predicate per level, most significant bits first
        alias Levels = GetBitSlicing!(21, sizes);
        // every code point maps to false unless covered by an interval below
        auto trieBuilder = TrieBuilder!(bool, dchar, lastDchar+1, Levels)(false);
        foreach (interval; set.byInterval)
        {
            trieBuilder.putRange(interval[0], interval[1], true);
        }
        return trieBuilder.build();
    }
}
4296 
/// Type of Trie generated by codepointSetTrie function.
public template CodepointSetTrie(sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    // Obtained by type-checking (never running) the same builder expression
    // that `codepointSetTrie` uses.
    alias CodepointSetTrie = typeof(
        TrieBuilder!(bool, dchar, lastDchar+1, GetBitSlicing!(21, sizes))(false).build());
}
4304 
/**
    A slightly more general tool for building fixed `Trie`
    for the Unicode data.

    Specifically, unlike `codepointSetTrie`, it allows creating mappings
    of `dchar` to an arbitrary type `T`.

    `sizes` are the numbers of bits per level, most significant bits first;
    their sum must be equal to 21.

    Note: Overload taking `CodepointSet`s will naturally convert
    only to bool mapping `Trie`s.

    CodepointTrie is the type of Trie as generated by codepointTrie function.
*/
public template codepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    alias Prefix = GetBitSlicing!(21, sizes);

    static if (is(TypeOfBitPacked!T == bool))
    {
        // Only offered for boolean value types: builds the trie straight
        // from the intervals of a code point set.
        auto codepointTrie(Set)(Set set)
            if (isCodepointSet!Set)
        {
            // `sizes` must be forwarded explicitly - it cannot be deduced from
            // the function argument, and an empty `sizes` fails the
            // "sum == 21" constraint of `codepointSetTrie`.
            // The set is taken by value, matching `codepointSetTrie`'s own
            // parameter, so that the nested call deduces the same `Set` type.
            return codepointSetTrie!sizes(set);
        }
    }

    /// Build from an associative array; unmapped code points get `defValue`.
    auto codepointTrie()(T[dchar] map, T defValue=T.init)
    {
        return buildTrie!(T, dchar, Prefix)(map, defValue);
    }

    // unsorted range of pairs
    /// Build from a range of (value, code point) pairs; the range is sorted in place first.
    auto codepointTrie(R)(R range, T defValue=T.init)
        if (isInputRange!R
            && is(typeof(ElementType!R.init[0]) : T)
            && is(typeof(ElementType!R.init[1]) : dchar))
    {
        // build from unsorted array of pairs
        // TODO: expose index sorting functions for Trie
        return buildTrie!(T, dchar, Prefix)(range, defValue, true);
    }
}
4349 
@system pure unittest
{
    import std.algorithm.comparison : max;
    import std.algorithm.searching : count;

    // pick characters from the Greek script
    auto set = unicode.Greek;

    // a user-defined property (or an expensive function)
    // that we want to look up
    static uint luckFactor(dchar ch)
    {
        // here we consider a character lucky if its code point,
        // written as 6 hex digits, has a lot of identical digits,
        // e.g. U+1F11 (001F11) has three digits equal to 1
        // and so a "luck factor" of 3
        ubyte[6] nibbles; // 6 4-bit chunks of code point
        uint rest = ch;
        foreach (ref nib; nibbles)
        {
            nib = rest & 0xF;
            rest >>= 4;
        }
        // the largest number of occurrences of any single nibble value
        uint best;
        foreach (n; nibbles)
        {
            best = cast(uint) max(best, count(nibbles[], n));
        }
        return best;
    }

    // only unsigned built-ins are supported at the moment
    alias LuckFactor = BitPacked!(uint, 3);

    // create a temporary associative array (AA)
    LuckFactor[dchar] map;
    foreach (ch; set.byCodepoint)
    {
        map[ch] = LuckFactor(luckFactor(ch));
    }

    // bits per stage are chosen randomly, feel free to optimize
    auto trie = codepointTrie!(LuckFactor, 8, 5, 8)(map);

    // from now on the AA is not needed
    foreach (ch; set.byCodepoint)
    {
        assert(trie[ch] == luckFactor(ch)); // verify
    }
    // CJK is not Greek, thus it has the default value
    assert(trie['\u4444'] == 0);
    // and here is a couple of quite lucky Greek characters:
    // Greek small letter epsilon with dasia
    assert(trie['\u1F11'] == 3);
    // Ancient Greek metretes sign
    assert(trie['\U00010181'] == 3);

}
4401 
/// ditto
public template CodepointTrie(T, sizes...)
if (sumOfIntegerTuple!sizes == 21)
{
    // Type of the trie a builder with these levels and value type produces;
    // the builder expression is only type-checked, never executed.
    alias CodepointTrie = typeof(
        TrieBuilder!(T, dchar, lastDchar+1, GetBitSlicing!(21, sizes))(T.init).build());
}
4409 
// "Less than" comparator over (value, key) tuples that orders them by
// `Pred` applied to the key (element 1); the value (element 0) is ignored.
package(std) template cmpK0(alias Pred)
{
    import std.typecons : Tuple;
    static bool cmpK0(Value, Key)
        (Tuple!(Value, Key) a, Tuple!(Value, Key) b)
    {
        auto lhs = Pred(a[1]);
        auto rhs = Pred(b[1]);
        return lhs < rhs;
    }
}
4419 
/**
    The most general utility for construction of `Trie`s
    short of using `TrieBuilder` directly.

    Provides a number of convenience overloads.
    `Args` is tuple of maximum key value followed by
    predicates to construct index from key.

    Alternatively if the first argument is not a value convertible to `Key`
    then the whole tuple of `Args` is treated as predicates
    and the maximum Key is deduced from predicates.
*/
private template buildTrie(Value, Key, Args...)
if (isValidArgsForTrie!(Key, Args))
{
    static if (is(typeof(Args[0]) : Key)) // prefix starts with upper bound on Key
    {
        alias Prefix = Args[1..$];
    }
    else
        alias Prefix = Args;

    // for multi-sort: one comparator per level, most significant level first
    template GetComparators(size_t n)
    {
        static if (n > 0)
            alias GetComparators =
                AliasSeq!(GetComparators!(n-1), cmpK0!(Prefix[n-1]));
        else
            alias GetComparators = AliasSeq!();
    }

    /*
        Build `Trie` from a range of a Key-Value pairs,
        assuming it is sorted by Key as defined by the following lambda:
        ------
        (a, b) => mapTrieIndex!(Prefix)(a) < mapTrieIndex!(Prefix)(b)
        ------
        Exception is thrown if it's detected that the above order doesn't hold.

        In other words $(LREF mapTrieIndex) should be a
        monotonically increasing function that maps `Key` to an integer.

        Each element is indexed as `[0]` for the value and `[1]` for the key.

        See_Also: $(REF sort, std,_algorithm),
        $(REF SortedRange, std,range),
        $(REF setUnion, std,_algorithm).
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if (isInputRange!Range && is(typeof(Range.init.front[0]) : Value)
            && is(typeof(Range.init.front[1]) : Key))
    {
        auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (v; range)
            builder.putValue(v[1], v[0]);
        return builder.build();
    }

    /*
        If `Value` is bool (or BitPacked!(bool, x)) then it's possible
        to build `Trie` from a range of open-right intervals of `Key`s.
        The requirement on the ordering of keys (and the behavior on the
        violation of it) is the same as for Key-Value range overload.

        Intervals denote ranges of !`filler` i.e. the opposite of filler.
        If no filler provided keys inside of the intervals map to true,
        and `filler` is false.
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if (is(TypeOfBitPacked!Value ==  bool)
            && isInputRange!Range && is(typeof(Range.init.front[0]) : Key)
            && is(typeof(Range.init.front[1]) : Key))
    {
        auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (ival; range)
            builder.putRange(ival[0], ival[1], !filler);
        return builder.build();
    }

    /*
        Same as the Key-Value range overload, but if `unsorted` is true
        the range is first sorted in place, level by level, with `multiSort`.
    */
    auto buildTrie(Range)(Range range, Value filler, bool unsorted)
        if (isInputRange!Range
            && is(typeof(Range.init.front[0]) : Value)
            && is(typeof(Range.init.front[1]) : Key))
    {
        import std.algorithm.sorting : multiSort;
        alias Comps = GetComparators!(Prefix.length);
        if (unsorted)
            multiSort!(Comps)(range);
        return buildTrie(range, filler);
    }

    /*
        If `Value` is bool (or BitPacked!(bool, x)) then it's possible
        to build `Trie` simply from an input range of `Key`s.
        The requirement on the ordering of keys (and the behavior on the
        violation of it) is the same as for Key-Value range overload.

        Keys found in range denote !`filler` i.e. the opposite of filler.
        If no filler provided keys map to true, and `filler` is false.
    */
    auto buildTrie(Range)(Range range, Value filler=Value.init)
        if (is(TypeOfBitPacked!Value ==  bool)
            && isInputRange!Range && is(typeof(Range.init.front) : Key))
    {
        auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (v; range)
            builder.putValue(v, !filler);
        return builder.build();
    }

    /*
        If `Key` is unsigned integer `Trie` could be constructed from array
        of values where array index serves as key.
    */
    auto buildTrie()(Value[] array, Value filler=Value.init)
        if (isUnsigned!Key)
    {
        // every index must be representable as a Key
        assert(array.length == 0 || array.length - 1 <= Key.max,
            "array is too long to be indexed by Key");
        auto builder = TrieBuilder!(Value, Key, Prefix)(filler);
        foreach (idx, v; array)
        {
            // The array index is a size_t; convert it explicitly so that this
            // overload also works when Key is narrower than size_t
            // (presumably putValue takes its key as Key, as the other
            // overloads suggest).
            builder.putValue(cast(Key) idx, v);
        }
        return builder.build();
    }

    /*
        Builds `Trie` from associative array.

        The pairs are copied into a temporary array of (value, key) tuples
        which is then sorted by the unsorted-range overload.
    */
    auto buildTrie(Key, Value)(Value[Key] map, Value filler=Value.init)
    {
        import std.array : array;
        import std.range : zip;
        auto range = array(zip(map.values, map.keys));
        return buildTrie(range, filler, true); // sort it
    }
}
4555 
// Identity index predicate tagged with a bit width.
// Used in place of assumeSize to reduce mangled name & help DMD inline
// Trie functors. Note that the argument is returned as is: no masking to
// `bits` bits is performed.
struct clamp(size_t bits)
{
    enum bitSize = bits;

    static size_t opCall(T)(T arg)
    {
        return arg;
    }
}
4563 
// Index predicate that picks element `idx` of an indexable key and tags it
// with a bit width of `bits`. The element is returned as is, without masking.
struct clampIdx(size_t idx, size_t bits)
{
    enum bitSize = bits;

    static size_t opCall(T)(T arg)
    {
        return arg[idx];
    }
}
4569 
/**
    Conceptual type that outlines the common properties of all UTF Matchers.

    Note: For illustration purposes only, every method
    call results in assertion failure.
    Use $(LREF utfMatcher) to obtain a concrete matcher
    for UTF-8 or UTF-16 encodings.
*/
public struct MatcherConcept
{
    /**
        $(P Perform the semantic equivalent of 2 operations:
        decoding a $(CODEPOINT) at front of `inp` and testing if
        it belongs to the set of $(CODEPOINTS) of this matcher. )

        $(P The effect on `inp` depends on the kind of function called:)

        $(P Match. If the codepoint is found in the set then range `inp`
        is advanced by its size in $(S_LINK Code unit, code units),
        otherwise the range is not modified.)

        $(P Skip. The range is always advanced by the size
        of the tested $(CODEPOINT) regardless of the result of test.)

        $(P Test. The range is left unaffected regardless
        of the result of test.)
    */
    public bool match(Range)(ref Range inp)
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(0);
    }

    ///ditto
    public bool skip(Range)(ref Range inp)
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(0);
    }

    ///ditto
    public bool test(Range)(ref Range inp)
    if (isRandomAccessRange!Range && is(ElementType!Range : char))
    {
        assert(0);
    }
    ///
    pure @safe unittest
    {
        string truth = "2² = 4";
        auto m = utfMatcher!char(unicode.Number);
        assert(m.match(truth)); // '2' is a number all right
        assert(truth == "² = 4"); // skips on match
        assert(m.match(truth)); // so is the superscript '2'
        assert(!m.match(truth)); // space is not a number
        assert(truth == " = 4"); // unaffected on no match
        assert(!m.skip(truth)); // same test ...
        assert(truth == "= 4"); // but skips a codepoint regardless
        assert(!m.test(truth)); // '=' is not a number
        assert(truth == "= 4"); // test never affects argument
    }

    /**
        Advanced feature - provide direct access to a subset of matcher based
        on a set of known encoding lengths. Lengths are provided in
        $(S_LINK Code unit, code units). The sub-matcher then may do fewer
        operations per any `test`/`match`.

        Use with care as the sub-matcher won't match
        any $(CODEPOINTS) that have encoded length that doesn't belong
        to the selected set of lengths. Also the sub-matcher object references
        the parent matcher and must not be used past the lifetime
        of the latter.

        Another caveat of using sub-matcher is that skip is not available
        precisely because sub-matcher doesn't detect all lengths.
    */
    @property auto subMatcher(Lengths...)()
    {
        assert(false);
        return this; // never reached; only gives `auto` a type to deduce
    }

    pure @safe unittest
    {
        auto m = utfMatcher!char(unicode.Number);
        string square = "2²";
        // about sub-matchers
        assert(!m.subMatcher!(2,3,4).test(square)); // ASCII is not covered
        assert(m.subMatcher!1.match(square)); // ASCII-only, works
        assert(!m.subMatcher!1.test(square)); // unicode '²'
        assert(m.subMatcher!(2,3,4).match(square));  //
        assert(square == "");
        wstring wsquare = "2²";
        auto m16 = utfMatcher!wchar(unicode.Number);
        // may keep ref, but the original (m16) must be kept alive
        auto bmp = m16.subMatcher!1;
        assert(bmp.match(wsquare)); // Okay, in basic multilingual plane
        assert(bmp.match(wsquare)); // And '²' too
    }
}
4671 
/**
    Test if `M` is an UTF Matcher for ranges of `C`.

    The check is a `__traits(compiles, ...)` over a function literal that
    declares a `C[]`, wraps it with `decoder`, default-constructs an `M` and
    then mentions `match`, `test` (and `skip`, when available) on both the
    decoder and the plain array.
*/
public enum isUtfMatcher(M, C) = __traits(compiles, (){
    C[] s;
    auto d = s.decoder;
    M m;
    // NOTE(review): the checks below are run-time `assert`s over `is(...)`
    // expressions inside a literal that is only compiled, never called.
    // `is(typeof(expr) == bool)` compiles even when `expr` does not, so these
    // lines look like they cannot make the trait false - presumably
    // `static assert` was intended; confirm before relying on this trait
    // to reject anything.
    assert(is(typeof(m.match(d)) == bool));
    assert(is(typeof(m.test(d)) == bool));
    // `skip` is optional: sub-matchers do not provide it
    static if (is(typeof(m.skip(d))))
    {
        assert(is(typeof(m.skip(d)) == bool));
        assert(is(typeof(m.skip(s)) == bool));
    }
    assert(is(typeof(m.match(s)) == bool));
    assert(is(typeof(m.test(s)) == bool));
});
4689 
pure @safe unittest
{
    // matchers over a code point set must satisfy the trait for both
    // mutable and immutable code units of their own encoding
    alias CharMatcher = typeof(utfMatcher!char(CodepointSet.init));
    alias WcharMatcher = typeof(utfMatcher!wchar(CodepointSet.init));
    static foreach (C; AliasSeq!(char, immutable(char)))
        static assert(isUtfMatcher!(CharMatcher, C));
    static foreach (W; AliasSeq!(wchar, immutable(wchar)))
        static assert(isUtfMatcher!(WcharMatcher, W));
}
4699 
// How a matcher lookup treats its input range once the code point at its
// front has been tested.
enum Mode
{
    alwaysSkip,  // advance past the code point whatever the result (`skip`)
    neverSkip,   // leave the range untouched (`test`)
    skipOnMatch  // advance only when the code point is in the set (`match`)
}
4705 
// Mixed into matchers so that plain arrays can be fed to the range-based
// `match`/`skip`/`test` named by `fn`.
mixin template ForwardStrings()
{
    private bool fwdStr(string fn, C)(ref C[] str) const @trusted
    {
        import std.utf : byCodeUnit;
        // The array is reinterpreted in place as the `byCodeUnit` wrapper type
        // so that any advance made by the callee is visible through `str`.
        // NOTE(review): this relies on that wrapper being layout-compatible
        // with a slice - confirm against std.utf.
        alias type = typeof(str.byCodeUnit);
        enum call = fn~"(*cast(type*)&str)";
        return mixin(call);
    }
}
4715 
// Building blocks of the UTF-8 matcher: one trie per encoded length (1 to 4
// code units), a `build` function that fills them from a code point set, and
// the `Impl`/`CherryPick` structs that expose `match`/`skip`/`test`.
template Utf8Matcher()
{
    enum validSize(int sz) = sz >= 1 && sz <= 4;

    // Always throws; callers still `return` afterwards only to satisfy
    // the compiler's flow analysis.
    void badEncoding() pure @safe
    {
        import std.utf : UTFException;
        throw new UTFException("Invalid UTF-8 sequence");
    }

    //for 1-stage ASCII
    alias AsciiSpec = AliasSeq!(bool, char, clamp!7);
    //for 2-stage lookup of 2 byte UTF-8 sequences
    alias Utf8Spec2 = AliasSeq!(bool, char[2],
        clampIdx!(0, 5), clampIdx!(1, 6));
    //ditto for 3 byte
    alias Utf8Spec3 = AliasSeq!(bool, char[3],
        clampIdx!(0, 4),
        clampIdx!(1, 6),
        clampIdx!(2, 6)
    );
    //ditto for 4 byte
    alias Utf8Spec4 = AliasSeq!(bool, char[4],
        clampIdx!(0, 3), clampIdx!(1, 6),
        clampIdx!(2, 6), clampIdx!(3, 6)
    );
    alias Tables = AliasSeq!(
        typeof(TrieBuilder!(AsciiSpec)(false).build()),
        typeof(TrieBuilder!(Utf8Spec2)(false).build()),
        typeof(TrieBuilder!(Utf8Spec3)(false).build()),
        typeof(TrieBuilder!(Utf8Spec4)(false).build())
    );
    // table type for sequences of `size` code units
    alias Table(int size) = Tables[size-1];

    // payload bits of the lead byte: 0x1F, 0x0F, 0x07 for sizes 2, 3, 4
    enum leadMask(size_t size) = (cast(size_t) 1<<(7 - size))-1;
    // length marker of the lead byte: 0xC0, 0xE0, 0xF0 for sizes 2, 3, 4
    enum encMask(size_t size) = ((1 << size)-1)<<(8-size);

    // Strips the 0b10 marker of a continuation byte, leaving its 6 payload
    // bits; throws if `ch` is not in 0x80 .. 0xBF.
    char truncate()(char ch) pure @safe
    {
        ch -= 0x80;
        if (ch < 0x40)
        {
            return ch;
        }
        else
        {
            badEncoding();
            return cast(char) 0;
        }
    }

    // Encodes `ch` as UTF-8 and strips the marker bits of each of the first
    // `sz` code units, producing a key in the form the size-`sz` table expects.
    static auto encode(size_t sz)(dchar ch)
        if (sz > 1)
    {
        import std.utf : encodeUTF = encode;
        char[4] buf;
        encodeUTF(buf, ch);
        char[sz] ret;
        buf[0] &= leadMask!sz;
        foreach (n; 1 .. sz)
            buf[n] = buf[n] & 0x3f; //keep 6 lower bits
        ret[] = buf[0 .. sz];
        return ret;
    }

    // Splits `set` by encoded length and builds one trie per length.
    auto build(Set)(Set set)
    {
        import std.algorithm.iteration : map;
        auto ascii = set & unicode.ASCII;
        auto utf8_2 = set & CodepointSet(0x80, 0x800);
        auto utf8_3 = set & CodepointSet(0x800, 0x1_0000);
        auto utf8_4 = set & CodepointSet(0x1_0000, lastDchar+1);
        auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
        auto utf8_2T = utf8_2.byCodepoint.map!(x=>encode!2(x)).buildTrie!(Utf8Spec2);
        auto utf8_3T = utf8_3.byCodepoint.map!(x=>encode!3(x)).buildTrie!(Utf8Spec3);
        auto utf8_4T = utf8_4.byCodepoint.map!(x=>encode!4(x)).buildTrie!(Utf8Spec4);
        alias Ret = Impl!(1,2,3,4);
        return Ret(asciiT, utf8_2T, utf8_3T, utf8_4T);
    }

    // Bootstrap UTF-8 static matcher interface
    // from 3 primitives: tab!(size), lookup and Sizes
    mixin template DefMatcher()
    {
        import std.format : format;
        import std.meta : Erase, staticIndexOf;
        enum hasASCII = staticIndexOf!(1, Sizes) >= 0;
        alias UniSizes = Erase!(1, Sizes);

        //generate dispatch code sequence for unicode parts:
        //an if/else chain on the lead byte, one arm per multi-byte size
        static auto genDispatch()
        {
            string code;
            foreach (size; UniSizes)
                code ~= format(q{
                    if ((ch & ~leadMask!%d) == encMask!(%d))
                        return lookup!(%d, mode)(inp);
                    else
                }, size, size, size);
            static if (Sizes.length == 4) //covers all code unit cases
                code ~= "{ badEncoding(); return false; }";
            else
                code ~= "return false;"; //may be just fine but not covered
            return code;
        }
        enum dispatch = genDispatch();

        // Tests the code point at the front of `inp`; advances past it
        // only if it is in the set.
        public bool match(Range)(ref Range inp) const
            if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
                !isDynamicArray!Range)
        {
            enum mode = Mode.skipOnMatch;
            assert(!inp.empty);
            immutable ch = inp[0];
            static if (hasASCII)
            {
                if (ch < 0x80)
                {
                    immutable r = tab!1[ch];
                    if (r)
                        inp.popFront();
                    return r;
                }
                else
                    mixin(dispatch);
            }
            else
                mixin(dispatch);
        }

        static if (Sizes.length == 4) // can skip iff can detect all encodings
        {
            // Tests the code point at the front of `inp` and always
            // advances past it.
            public bool skip(Range)(ref Range inp) const
                if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
                    !isDynamicArray!Range)
            {
                enum mode = Mode.alwaysSkip;
                assert(!inp.empty);
                auto ch = inp[0];
                static if (hasASCII)
                {
                    if (ch < 0x80)
                    {
                        inp.popFront();
                        return tab!1[ch];
                    }
                    else
                        mixin(dispatch);
                }
                else
                    mixin(dispatch);
            }

            // The array flavour forwards to the range `skip` above, so it is
            // declared only when that overload exists.
            bool skip(C)(ref C[] str) const
                if (isSomeChar!C)
            {
                return fwdStr!"skip"(str);
            }
        }

        // Tests the code point at the front of `inp`; never advances.
        public bool test(Range)(ref Range inp) const
            if (isRandomAccessRange!Range && is(ElementType!Range : char) &&
                !isDynamicArray!Range)
        {
            enum mode = Mode.neverSkip;
            assert(!inp.empty);
            auto ch = inp[0];

            static if (hasASCII)
            {
                if (ch < 0x80)
                    return tab!1[ch];
                else
                    mixin(dispatch);
            }
            else
                mixin(dispatch);
        }

        bool match(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"match"(str);
        }

        bool test(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"test"(str);
        }

        mixin ForwardStrings;
    }

    // Matcher owning the tables for the encoded lengths listed in `Sizes`.
    struct Impl(Sizes...)
    {
        import std.meta : allSatisfy, staticMap;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
    private:
        //pick tables for chosen sizes
        alias OurTabs = staticMap!(Table, Sizes);
        OurTabs tables;
        mixin DefMatcher;
        //static dispatch helper UTF size ==> table
        alias tab(int i) = tables[i - 1];

        // The returned sub-matcher keeps a pointer to `this`.
        package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
        {
            return CherryPick!(Impl, SizesToPick)(&this);
        }

        // Decodes a `size`-unit sequence at the front of `inp` into a table
        // key, rejects truncated, malformed and overlong sequences, then
        // looks the key up and advances `inp` according to `mode`.
        bool lookup(int size, Mode mode, Range)(ref Range inp) const
        {
            import std.range : popFrontN;
            if (inp.length < size)
            {
                badEncoding();
                return false;
            }
            char[size] needle = void;
            needle[0] = leadMask!size & inp[0];
            static foreach (i; 1 .. size)
            {
                needle[i] = truncate(inp[i]);
            }
            //overlong encoding checks
            static if (size == 2)
            {
                //0x80-0x7FF
                //got 6 bits in needle[1], must use at least 8 bits
                //must use at least 2 bits in needle[0]
                if (needle[0] < 2) badEncoding();
            }
            else static if (size == 3)
            {
                //0x800-0xFFFF
                //got 6 bits in needle[2], must use at least 12bits
                //must use 6 bits in needle[1] or anything in needle[0]
                if (needle[0] == 0 && needle[1] < 0x20) badEncoding();
            }
            else static if (size == 4)
            {
                //0x10000-0x10FFFF
                //got 2x6=12 bits in needle[2 .. 3] must use at least 17bits
                //must use 5 bits (or above) in needle[1] or anything in needle[0]
                if (needle[0] == 0 && needle[1] < 0x10) badEncoding();
            }
            static if (mode == Mode.alwaysSkip)
            {
                inp.popFrontN(size);
                return tab!size[needle];
            }
            else static if (mode == Mode.neverSkip)
            {
                return tab!size[needle];
            }
            else
            {
                static assert(mode == Mode.skipOnMatch);

                if (tab!size[needle])
                {
                    inp.popFrontN(size);
                    return true;
                }
                else
                    return false;
            }
        }
    }

    // View over a parent matcher `I` restricted to the encoded lengths in
    // `Sizes`; holds only a pointer, so the parent must outlive it.
    struct CherryPick(I, Sizes...)
    {
        import std.meta : allSatisfy;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1, 2, 3 and 4 code unit are possible for UTF-8");
    private:
        I* m;
        @property auto tab(int i)() const { return m.tables[i - 1]; }
        bool lookup(int size, Mode mode, Range)(ref Range inp) const
        {
            return m.lookup!(size, mode)(inp);
        }
        mixin DefMatcher;
    }
}
5003 
5004 template Utf16Matcher()
5005 {
5006     enum validSize(int sz) = sz >= 1 && sz <= 2;
5007 
5008     void badEncoding() pure @safe
5009     {
5010         import std.utf : UTFException;
5011         throw new UTFException("Invalid UTF-16 sequence");
5012     }
5013 
5014     // 1-stage ASCII
5015     alias AsciiSpec = AliasSeq!(bool, wchar, clamp!7);
5016     //2-stage BMP
5017     alias BmpSpec = AliasSeq!(bool, wchar, sliceBits!(7, 16), sliceBits!(0, 7));
5018     //4-stage - full Unicode
5019     //assume that 0xD800 & 0xDC00 bits are cleared
5020     //thus leaving 10 bit per wchar to worry about
5021     alias UniSpec = AliasSeq!(bool, wchar[2],
5022         assumeSize!(x=>x[0]>>4, 6), assumeSize!(x=>x[0]&0xf, 4),
5023         assumeSize!(x=>x[1]>>6, 4), assumeSize!(x=>x[1]&0x3f, 6),
5024     );
5025     alias Ascii = typeof(TrieBuilder!(AsciiSpec)(false).build());
5026     alias Bmp = typeof(TrieBuilder!(BmpSpec)(false).build());
5027     alias Uni = typeof(TrieBuilder!(UniSpec)(false).build());
5028 
5029     auto encode2(dchar ch)
5030     {
5031         ch -= 0x1_0000;
5032         assert(ch <= 0xF_FFFF);
5033         wchar[2] ret;
5034         //do not put surrogate bits, they are sliced off
5035         ret[0] = cast(wchar)(ch >> 10);
5036         ret[1] = (ch & 0xFFF);
5037         return ret;
5038     }
5039 
5040     auto build(Set)(Set set)
5041     {
5042         import std.algorithm.iteration : map;
5043         auto ascii = set & unicode.ASCII;
5044         auto bmp = (set & CodepointSet.fromIntervals(0x80, 0xFFFF+1))
5045             - CodepointSet.fromIntervals(0xD800, 0xDFFF+1);
5046         auto other = set - (bmp | ascii);
5047         auto asciiT = ascii.byCodepoint.map!(x=>cast(char) x).buildTrie!(AsciiSpec);
5048         auto bmpT = bmp.byCodepoint.map!(x=>cast(wchar) x).buildTrie!(BmpSpec);
5049         auto otherT = other.byCodepoint.map!(x=>encode2(x)).buildTrie!(UniSpec);
5050         alias Ret = Impl!(1,2);
5051         return Ret(asciiT, bmpT, otherT);
5052     }
5053 
    // Bootstraps the full UTF-16 matcher interface (match/skip/test) from
    // sizeFlags, lookupUni and - when (sizeFlags & 1) is set - ascii,
    // all of which must be provided by the struct this template is mixed into.
    mixin template DefMatcher()
    {
        // Tests the code point at the front of `inp` and advances past it
        // only when it matches (Mode.skipOnMatch).
        public bool match(Range)(ref Range inp) const
            if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
                !isDynamicArray!Range)
        {
            enum mode = Mode.skipOnMatch;
            assert(!inp.empty);
            immutable ch = inp[0];
            static if (sizeFlags & 1)
            {
                // fast path: code units below 0x80 are answered by `ascii` directly
                if (ch < 0x80)
                {
                  if (ascii[ch])
                  {
                      inp.popFront();
                      return true;
                  }
                  else
                      return false;
                }
                return lookupUni!mode(inp);
            }
            else
                return lookupUni!mode(inp);
        }

        // `skip` is only generated when exactly two sizes were requested.
        static if (Sizes.length == 2)
        {
            // Tests the code point at the front of `inp` and advances past it
            // regardless of the outcome (Mode.alwaysSkip).
            public bool skip(Range)(ref Range inp) const
                if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
                    !isDynamicArray!Range)
            {
                enum mode = Mode.alwaysSkip;
                assert(!inp.empty);
                immutable ch = inp[0];
                static if (sizeFlags & 1)
                {
                    if (ch < 0x80)
                    {
                        // `ch` was read before the pop, so the lookup is still valid
                        inp.popFront();
                        return ascii[ch];
                    }
                    else
                        return lookupUni!mode(inp);
                }
                else
                    return lookupUni!mode(inp);
            }
        }

        // Tests the code point at the front of `inp` using Mode.neverSkip;
        // the ASCII path here does not pop anything from `inp`.
        public bool test(Range)(ref Range inp) const
            if (isRandomAccessRange!Range && is(ElementType!Range : wchar) &&
                !isDynamicArray!Range)
        {
            enum mode = Mode.neverSkip;
            assert(!inp.empty);
            auto ch = inp[0];
            static if (sizeFlags & 1)
                return ch < 0x80 ? ascii[ch] : lookupUni!mode(inp);
            else
                return lookupUni!mode(inp);
        }

        // Dynamic-array overloads: forwarded by name through `fwdStr`.
        // NOTE(review): `fwdStr` presumably comes from the `ForwardStrings`
        // mixin below and dispatches to the range versions - confirm there.
        bool match(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"match"(str);
        }

        bool skip(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"skip"(str);
        }

        bool test(C)(ref C[] str) const
            if (isSomeChar!C)
        {
            return fwdStr!"test"(str);
        }

        mixin ForwardStrings; //dispatch strings to range versions
    }
5140 
    // UTF-16 matcher implementation, parameterized by the code unit counts
    // (1 and/or 2) it has to handle. `sizeFlags` is the bitwise OR of the
    // requested sizes and selects which lookup tables are stored.
    struct Impl(Sizes...)
        if (Sizes.length >= 1 && Sizes.length <= 2)
    {
    private:
        import std.meta : allSatisfy;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1 and 2 code units are possible in UTF-16");
        static if (Sizes.length > 1)
            enum sizeFlags = Sizes[0] | Sizes[1];
        else
            enum sizeFlags = Sizes[0];

        // tables for single code unit sequences
        static if (sizeFlags & 1)
        {
            Ascii ascii;
            Bmp bmp;
        }
        // table for two code unit (surrogate pair) sequences
        static if (sizeFlags & 2)
        {
            Uni uni;
        }
        mixin DefMatcher;

        // Returns a lightweight view of this matcher; the result stores a
        // pointer to `this`, so it must not outlive this matcher.
        package(std) @property CherryPick!(Impl, SizesToPick) subMatcher(SizesToPick...)()
        {
            return CherryPick!(Impl, SizesToPick)(&this);
        }

        // Classifies the code point at the front of `inp` when the ASCII fast
        // path did not apply. `mode` decides if and when `inp` is advanced.
        // Calls badEncoding() on a lone low surrogate, on a high surrogate
        // that is the last code unit, or one not followed by a low surrogate.
        bool lookupUni(Mode mode, Range)(ref Range inp) const
        {
            // after the subtraction (truncated to wchar) high surrogates
            // map to 0 .. 0x3FF and low surrogates to 0x400 .. 0x7FF
            wchar x = cast(wchar)(inp[0] - 0xD800);
            //not a high surrogate
            if (x > 0x3FF)
            {
                //low surrogate
                if (x <= 0x7FF) badEncoding();
                static if (sizeFlags & 1)
                {
                    // read the code unit before any pop so the lookup stays valid
                    auto ch = inp[0];
                    static if (mode == Mode.alwaysSkip)
                        inp.popFront();
                    static if (mode == Mode.skipOnMatch)
                    {
                        if (bmp[ch])
                        {
                            inp.popFront();
                            return true;
                        }
                        else
                            return false;
                    }
                    else
                        return bmp[ch];
                }
                else //skip is not available for sub-matchers, so just false
                    return false;
            }
            else
            {
                import std.range : popFrontN;
                static if (sizeFlags & 2)
                {
                    if (inp.length < 2)
                        badEncoding();
                    wchar y = cast(wchar)(inp[1] - 0xDC00);
                    //not a low surrogate
                    if (y > 0x3FF)
                        badEncoding();
                    // the lookup key is the low 10 bits of each surrogate
                    wchar[2] needle = [inp[0] & 0x3ff, inp[1] & 0x3ff];
                    static if (mode == Mode.alwaysSkip)
                        inp.popFrontN(2);
                    static if (mode == Mode.skipOnMatch)
                    {
                        if (uni[needle])
                        {
                            inp.popFrontN(2);
                            return true;
                        }
                        else
                            return false;
                    }
                    else
                        return uni[needle];
                }
                else //ditto
                    return false;
            }
        }
    }
5230 
    // Sub-matcher view over an existing matcher `I`: stores only a pointer
    // to the parent and forwards `ascii` and `lookupUni` to it.
    // NOTE(review): `sizeFlags` is copied from the parent rather than derived
    // from `Sizes`, so lookups use the parent's full set of tables - confirm
    // this is intended.
    struct CherryPick(I, Sizes...)
        if (Sizes.length >= 1 && Sizes.length <= 2)
    {
    private:
        import std.meta : allSatisfy;
        I* m; // parent matcher; must outlive this view
        enum sizeFlags = I.sizeFlags;

        static if (sizeFlags & 1)
        {
            @property auto ascii()() const { return m.ascii; }
        }

        bool lookupUni(Mode mode, Range)(ref Range inp) const
        {
            return m.lookupUni!mode(inp);
        }
        mixin DefMatcher;
        static assert(allSatisfy!(validSize, Sizes),
            "Only lengths of 1 and 2 code units are possible in UTF-16");
    }
5252 }
5253 
// Builds a UTF-8 matcher for `set` from the default `Utf8Matcher` instantiation.
private auto utf8Matcher(Set)(Set set)
{
    alias Builder = Utf8Matcher!();
    return Builder.build(set);
}
5258 
// Builds a UTF-16 matcher for `set` from the default `Utf16Matcher` instantiation.
private auto utf16Matcher(Set)(Set set)
{
    alias Builder = Utf16Matcher!();
    return Builder.build(set);
}
5263 
/**
    Constructs a matcher object
    to classify $(CODEPOINTS) from the `set` for encoding
    that has `Char` as code unit.

    Only `char` (UTF-8) and `wchar` (UTF-16) code units are supported;
    any other `Char` is rejected at compile time.

    See $(LREF MatcherConcept) for API outline.
*/
public auto utfMatcher(Char, Set)(Set set)
if (isCodepointSet!Set)
{
    // `char` must be checked first: it also converts implicitly to `wchar`
    static if (is(Char : char))
        return utf8Matcher(set);
    else static if (is(Char : wchar))
        return utf16Matcher(set);
    else static if (is(Char : dchar))
        // message is concatenated so the diagnostic stays on a single line
        static assert(false, "UTF-32 needs no decoding, " ~
            "and thus not supported by utfMatcher");
    else
        static assert(false, "Only character types 'char' and 'wchar' are allowed");
}
5284 
5285 
// Wraps a slice of code units together with a read position, so that moving
// forward is a plain index increment instead of re-slicing the array.
package(std) auto decoder(C)(C[] s, size_t offset=0)
if (is(C : wchar) || is(C : char))
{
    static struct Decoder
    {
    pure nothrow:
        C[] str;    // underlying code units
        size_t idx; // index of the current front within `str`

        @property bool empty()
        {
            return idx == str.length;
        }

        @property size_t length()
        {
            return str.length - idx;
        }
        alias opDollar = length;

        @property C front()
        {
            return str[idx];
        }

        @property C back()
        {
            return str[$-1];
        }

        void popFront()
        {
            idx++;
        }

        void popFrontN(size_t n)
        {
            idx += n;
        }

        void popBack()
        {
            str = str[0..$-1];
        }

        @property auto save()
        {
            return this;
        }

        // indices are relative to the current front
        auto opIndex(size_t i)
        {
            return str[idx+i];
        }

        auto opSlice(size_t lower, size_t upper)
        {
            return Decoder(str[0 .. idx+upper], idx+lower);
        }
    }
    static assert(isRandomAccessRange!Decoder);
    static assert(is(ElementType!Decoder : C));
    return Decoder(s, offset);
}
5311 
// Walks a mixed ASCII / non-ASCII string with the full UTF-8 matcher and its
// sub-matchers, checking both the verdicts and how far `codec.idx` moves.
pure @safe unittest
{
    string rs = "hi! ネемног砀 текста";
    auto codec = rs.decoder;
    auto utf8 =  utf8Matcher(unicode.Letter);
    auto asc = utf8.subMatcher!(1);
    auto uni = utf8.subMatcher!(2,3,4);

    // h
    assert(asc.test(codec));
    assert(!uni.match(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 1);

    // i
    assert(asc.test(codec));
    assert(!uni.match(codec));
    assert(utf8.skip(codec));
    assert(codec.idx == 2);

    // !
    assert(!asc.match(codec));
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));
    assert(codec.idx == 3);

    // space
    assert(!asc.test(codec));
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));
    assert(codec.idx == 4);

    // the seven non-ASCII letters that follow
    assert(utf8.test(codec));
    foreach (i; 0 .. 7)
    {
        assert(!asc.test(codec));
        assert(uni.test(codec));
        assert(utf8.skip(codec));
    }
    assert(!utf8.test(codec));
    assert(!utf8.skip(codec));

    //the same with match where applicable
    codec = rs.decoder;
    assert(utf8.match(codec));
    assert(codec.idx == 1);
    assert(utf8.match(codec));
    assert(codec.idx == 2);
    // a failed match must leave the position untouched
    assert(!utf8.match(codec));
    assert(codec.idx == 2);
    assert(!utf8.skip(codec));
    assert(!utf8.skip(codec));

    foreach (i; 0 .. 7)
    {
        assert(!asc.test(codec));
        assert(utf8.test(codec));
        assert(utf8.match(codec));
    }
    auto i = codec.idx;
    assert(!utf8.match(codec));
    assert(codec.idx == i);
}
5375 
// Cross-checks test/match/skip of the UTF-8 and UTF-16 matchers (and their
// sub-matchers) against every third code point of unicode.L.
pure @system unittest
{
    import std.range : stride;
    // Runs `test`, `match` and (when available) `skip` on the same position
    // and verifies they agree; restores `r.idx` so `r` is left unchanged.
    static bool testAll(Matcher, Range)(ref Matcher m, ref Range r) @safe
    {
        bool t = m.test(r);
        auto save = r.idx;
        assert(t == m.match(r));
        assert(r.idx == save || t); //either no change or was match
        r.idx = save;
        static if (is(typeof(m.skip(r))))
        {
            assert(t == m.skip(r));
            assert(r.idx != save); //always changed
            r.idx = save;
        }
        return t;
    }
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto bmp = utf16.subMatcher!1;
    // was subMatcher!1, which merely duplicated `bmp` and left the
    // two-code-unit sub-matcher uninstantiated
    auto nonBmp = utf16.subMatcher!2;
    auto utf8 = utfMatcher!char(unicode.L);
    auto ascii = utf8.subMatcher!1;
    auto uni2 = utf8.subMatcher!2;
    auto uni3 = utf8.subMatcher!3;
    auto uni24 = utf8.subMatcher!(2,4);
    foreach (ch; unicode.L.byCodepoint.stride(3))
    {
        import std.utf : encode;
        char[4] buf;
        wchar[2] buf16;
        auto len = encode(buf, ch);
        auto len16 = encode(buf16, ch);
        auto c8 = buf[0 .. len].decoder;
        auto c16 = buf16[0 .. len16].decoder;
        assert(testAll(utf16, c16));
        assert(testAll(bmp, c16) || len16 != 1);
        assert(testAll(nonBmp, c16) || len16 != 2);

        assert(testAll(utf8, c8));

        //submatchers return false on out of their domain
        assert(testAll(ascii, c8) || len != 1);
        assert(testAll(uni2, c8) || len != 2);
        assert(testAll(uni3, c8) || len != 3);
        assert(testAll(uni24, c8) || (len != 2 && len != 4));
    }
}
5424 
// cover decode fail cases of Matcher
pure @safe unittest
{
    import std.algorithm.iteration : map;
    import std.exception : collectException;
    import std.format : format;
    auto utf16 = utfMatcher!wchar(unicode.L);
    auto utf8 = utfMatcher!char(unicode.L);
    //decode failure cases UTF-8
    // NOTE(review): `\0x00` / `\0x80` below are lexed as the octal escape `\0`
    // followed by the literal characters "x00" / "x80", not as `\x00` / `\x80`.
    // The bytes are deliberately left as they are: switching to the hex
    // escapes changes the tested input and may change which cases throw.
    alias fails8 = AliasSeq!("\xC1", "\x80\x00","\xC0\x00", "\xCF\x79",
        "\xFF\x00\0x00\0x00\x00", "\xC0\0x80\0x80\x80", "\x80\0x00\0x00\x00",
        "\xCF\x00\0x00\0x00\x00");
    foreach (msg; fails8)
    {
        // every input must make `test` throw; the hex dump identifies the case
        assert(collectException((){
            auto s = msg;
            utf8.test(s);
        }()), format("%( %2x %)", cast(immutable(ubyte)[]) msg));
    }
    //decode failure cases UTF-16
    // a high surrogate with nothing after it, and a lone low surrogate
    alias fails16 = AliasSeq!([0xD811], [0xDC02]);
    foreach (msg; fails16)
    {
        assert(collectException((){
            auto s = msg.map!(x => cast(wchar) x);
            utf16.test(s);
        }()));
    }
}
5455 
/++
    Convenience function to construct optimal configurations for
    packed Trie from any `set` of $(CODEPOINTS).

    The parameter `level` indicates the number of trie levels to use,
    allowed values are: 1, 2, 3 or 4. Levels represent different trade-offs
    speed-size wise.

    $(P Level 1 is fastest and the most memory hungry (a bit array). )
    $(P Level 4 is the slowest and has the smallest footprint. )

    See the $(S_LINK Synopsis, Synopsis) section for example.

    Note:
    Level 4 stays very practical (being faster and more predictable)
    compared to using direct lookup on the `set` itself.


+/
public auto toTrie(size_t level, Set)(Set set)
if (isCodepointSet!Set)
{
    // the bit widths of each configuration add up to the 21 bits of a code point
    static if (level == 1)
        return codepointSetTrie!(21)(set);
    else static if (level == 2)
        return codepointSetTrie!(10, 11)(set);
    else static if (level == 3)
        return codepointSetTrie!(8, 5, 8)(set);
    else static if (level == 4)
        return codepointSetTrie!(6, 4, 4, 7)(set);
    else
        // reached for level 0 as well as for levels above 4
        static assert(false,
            "Sorry, toTrie supports only levels 1, 2, 3 and 4, use codepointSetTrie directly");
}
5490 
/**
    $(P Builds a `Trie` with typically optimal speed-size trade-off
    and wraps it into a delegate of the following type:
    $(D bool delegate(dchar ch)). )

    $(P Effectively this creates a 'tester' lambda suitable
    for algorithms like std.algorithm.find that take unary predicates. )

    See the $(S_LINK Synopsis, Synopsis) section for example.
*/
public auto toDelegate(Set)(Set set)
if (isCodepointSet!Set)
{
    // a 3-level trie is very small and almost as fast as a 2-level one
    // (due to CPU caches?)
    auto trie = toTrie!3(set);
    return delegate(dchar ch) { return trie[ch]; };
}
5508 
/**
    $(P Opaque wrapper around unsigned built-in integers and
    code unit (char/wchar/dchar) types.
    Parameter `sz` indicates that the value is confined
    to the range of [0, 2^^sz$(RPAREN). With this knowledge it can be
    packed more tightly when stored in certain
    data-structures like trie. )

    Note:
    $(P The $(D BitPacked!(T, sz)) is implicitly convertible to `T`
    but not vice versa. Users have to ensure the value fits in
    the range required and use the `cast`
    operator to perform the conversion.)
*/
struct BitPacked(T, size_t sz)
if (isIntegral!T || is(T:dchar))
{
    enum bitSize = sz; // number of significant bits, as declared by the user
    T _value;
    alias _value this; // provides the implicit conversion to `T`
}
5530 
/*
    Depending on the form of the passed argument `bitSizeOf` returns
    the amount of bits required to represent a given type
    or a return type of a given functor.

    Resolution order:
    1. `T.bitSize`, when `T` exposes one that is assignable to `size_t`;
    2. the `bitSize` of `ReturnType!T`, when that is a `BitPacked`;
    3. `T.sizeof*8` otherwise.
*/
template bitSizeOf(Args...)
if (Args.length == 1)
{
    import std.traits : ReturnType;
    alias T = Args[0];
    static if (__traits(compiles, { size_t val = T.bitSize; })) //(is(typeof(T.bitSize) : size_t))
    {
        enum bitSizeOf = T.bitSize;
    }
    else static if (is(ReturnType!T dummy == BitPacked!(U, bits), U, size_t bits))
    {
        enum bitSizeOf = bitSizeOf!(ReturnType!T);
    }
    else
    {
        enum bitSizeOf = T.sizeof*8;
    }
}
5554 
/**
    Tests if `T` is some instantiation of $(LREF BitPacked)!(U, x)
    and thus suitable for packing.
*/
template isBitPacked(T)
{
    enum isBitPacked = is(T == BitPacked!(Base, width), Base, size_t width);
}
5566 
/**
    Gives the type `U` from $(LREF BitPacked)!(U, x)
    or `T` itself for every other type.
*/
template TypeOfBitPacked(T)
{
    static if (is(T == BitPacked!(Base, width), Base, size_t width))
    {
        // unwrap: expose the underlying storage type
        alias TypeOfBitPacked = Base;
    }
    else
    {
        alias TypeOfBitPacked = T;
    }
}
5578 
/*
    Wrapper, used in definition of custom data structures from `Trie` template.
    Applying it to a unary lambda function indicates that the returned value always
    fits within `bits` of bits.

    The promise is only recorded (as `bitSize`); the result of `Fn` is
    returned as is, without any check or masking.
*/
struct assumeSize(alias Fn, size_t bits)
{
    enum bitSize = bits;
    // forwards the call to `Fn` unchanged
    static auto ref opCall(T)(auto ref T arg)
    {
        return Fn(arg);
    }
}
5592 
/*
    A helper for defining lambda function that yields a slice
    of certain bits from an unsigned integral value.
    The resulting lambda is wrapped in assumeSize and can be used directly
    with `Trie` template.

    `sliceBits!(from, to)(x)` yields bits [from, to) of `x`, shifted down
    so that bit `from` becomes bit 0.
*/
struct sliceBits(size_t from, size_t to)
{
    //for now bypass assumeSize, DMD has trouble inlining it
    enum bitSize = to-from;
    static auto opCall(T)(T x)
    out(result)
    {
        // `-` binds tighter than `<<`, so this reads 1 << (to-from)
        assert(result < (1 << to-from));
    }
    do
    {
        static assert(from < to);
        // no shift is needed when the slice starts at bit 0
        static if (from == 0)
            return x & ((1 << to)-1);
        else
        return (x >> from) & ((1<<(to-from))-1);
    }
}
5617 
// Extracts bits 0 .. 7 of `x`.
@safe pure nothrow @nogc uint low_8(uint x)
{
    enum uint byteMask = 0xFF;
    return x & byteMask;
}
// Extracts bits 8 .. 15 of `x`.
@safe pure nothrow @nogc uint midlow_8(uint x)
{
    return (x >> 8) & 0xFF;
}
// Byte extractors tagged with their 8-bit result width for use as trie index functions.
alias lo8 = assumeSize!(low_8, 8);
alias mlo8 = assumeSize!(midlow_8, 8);
5622 
// bitSizeOf picks up the declared width from each kind of wrapper.
@safe pure nothrow @nogc unittest
{
    static assert(bitSizeOf!lo8 == 8);
    static assert(bitSizeOf!(sliceBits!(4, 7)) == 3);
    static assert(bitSizeOf!(BitPacked!(uint, 2)) == 2);
}
5629 
// Compile-time sequence of the integers start, start+1, ..., end-1
// (empty when start >= end).
template Sequence(size_t start, size_t end)
{
    static if (start >= end)
    {
        alias Sequence = AliasSeq!();
    }
    else
    {
        alias Sequence = AliasSeq!(start, Sequence!(start+1, end));
    }
}
5637 
//---- TRIE TESTS ----
// Builds tries of 1 to 4 levels over small sets (plus one string-keyed trie)
// and compares lookups against membership in the source set.
@system unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.sorting : sort;
    import std.array : array;
    import std.conv : text, to;
    import std.range : iota;
    // prints per-level footprint; compiled in only with version std_uni_stats
    static trieStats(TRIE)(TRIE t)
    {
        version (std_uni_stats)
        {
            import std.stdio : writefln, writeln;
            writeln("---TRIE FOOTPRINT STATS---");
            static foreach (i; 0 .. t.table.dim)
            {
                writefln("lvl%s = %s bytes;  %s pages"
                         , i, t.bytes!i, t.pages!i);
            }
            writefln("TOTAL: %s bytes", t.bytes);
            version (none)
            {
                writeln("INDEX (excluding value level):");
                static foreach (i; 0 .. t.table.dim-1)
                    writeln(t.table.slice!(i)[0 .. t.table.length!i]);
            }
            writeln("---------------------------");
        }
    }
    //@@@BUG link failure, lambdas not found by linker somehow (in case of trie2)
    // alias lo8   = assumeSize!(8, function (uint x) { return x&0xFF; });
    // alias next8 = assumeSize!(7, function (uint x) { return (x&0x7F00)>>8; });
    alias Set = CodepointSet;
    // intervals are half-open, hence the strict upper bounds in the loops below
    auto set = Set('A','Z','a','z');
    auto trie = buildTrie!(bool, uint, 256, lo8)(set.byInterval);// simple bool array
    for (int a='a'; a<'z';a++)
        assert(trie[a]);
    for (int a='A'; a<'Z';a++)
        assert(trie[a]);
    for (int a=0; a<'A'; a++)
        assert(!trie[a]);
    for (int a ='Z'; a<'a'; a++)
        assert(!trie[a]);
    trieStats(trie);

    // two-level trie
    auto redundant2 = Set(
        1, 18, 256+2, 256+111, 512+1, 512+18, 768+2, 768+111);
    auto trie2 = buildTrie!(bool, uint, 1024, mlo8, lo8)(redundant2.byInterval);
    trieStats(trie2);
    foreach (e; redundant2.byCodepoint)
        assert(trie2[e], text(cast(uint) e, " - ", trie2[e]));
    foreach (i; 0 .. 1024)
    {
        assert(trie2[i] == (i in redundant2));
    }


    // three-level trie
    auto redundant3 = Set(
          2,    4,    6,    8,    16,
       2+16, 4+16, 16+6, 16+8, 16+16,
       2+32, 4+32, 32+6, 32+8,
      );

    enum max3 = 256;
    // sliceBits
    auto trie3 = buildTrie!(bool, uint, max3,
            sliceBits!(6,8), sliceBits!(4,6), sliceBits!(0,4)
        )(redundant3.byInterval);
    trieStats(trie3);
    foreach (i; 0 .. max3)
        assert(trie3[i] == (i in redundant3), text(cast(uint) i));

    // four-level trie
    auto redundant4 = Set(
            10, 64, 64+10, 128, 128+10, 256, 256+10, 512,
            1000, 2000, 3000, 4000, 5000, 6000
        );
    enum max4 = 2^^16;
    auto trie4 = buildTrie!(bool, size_t, max4,
            sliceBits!(13, 16), sliceBits!(9, 13), sliceBits!(6, 9) , sliceBits!(0, 6)
        )(redundant4.byInterval);
    // only members are verified here; non-members are not checked
    foreach (i; 0 .. max4)
    {
        if (i in redundant4)
            assert(trie4[i], text(cast(uint) i));
    }
    trieStats(trie4);

        // string keys, indexed by their first char only
        alias mapToS = mapTrieIndex!(useItemAt!(0, char));
        string[] redundantS = ["tea", "start", "orange"];
        redundantS.sort!((a,b) => mapToS(a) < mapToS(b))();
        auto strie = buildTrie!(bool, string, useItemAt!(0, char))(redundantS);
        // using first char only
        assert(redundantS == ["orange", "start", "tea"]);
        assert(strie["test"], text(strie["test"]));
        assert(!strie["aea"]);
        assert(strie["s"]);

    // a bit size test
    auto a = array(map!(x => to!ubyte(x))(iota(0, 256)));
    auto bt = buildTrie!(bool, ubyte, sliceBits!(7, 8), sliceBits!(5, 7), sliceBits!(0, 5))(a);
    trieStats(bt);
    foreach (i; 0 .. 256)
        assert(bt[cast(ubyte) i]);
}
5742 
// Trie index function that yields element `idx` of an array key;
// its bit size is declared as the full width of `T`.
template useItemAt(size_t idx, T)
if (isIntegral!T || is(T: dchar))
{
    size_t impl(const scope T[] arr){ return arr[idx]; }
    alias useItemAt = assumeSize!(impl, 8*T.sizeof);
}
5749 
// Trie index function that yields the last element of an array key;
// its bit size is declared as the full width of `T`.
template useLastItem(T)
{
    size_t impl(const scope T[] arr){ return arr[$-1]; }
    alias useLastItem = assumeSize!(impl, 8*T.sizeof);
}
5755 
// Sum of `bitSizeOf` over all elements of `Prefix` (0 for an empty list).
template fullBitSize(Prefix...)
{
    static if (Prefix.length == 0)
    {
        enum fullBitSize = 0;
    }
    else
    {
        enum fullBitSize = bitSizeOf!(Prefix[0])+fullBitSize!(Prefix[1..$]);
    }
}
5763 
// Computes the sequence of BitPacked index types, one per non-value level
// of a trie described by the index functions in `Prefix`.
template idxTypes(Key, size_t fullBits, Prefix...)
{
    static if (Prefix.length == 1)
    {// the last level is value level, so no index once reduced to 1-level
        alias idxTypes = AliasSeq!();
    }
    else
    {
        // Important note on bit packing
        // Each level has to hold enough bits to address the next one
        // The bottom level is known to hold full bit width
        // thus its size in pages is full_bit_width - size_of_last_prefix
        // Recurse on this notion
        alias idxTypes =
            AliasSeq!(
                idxTypes!(Key, fullBits - bitSizeOf!(Prefix[$-1]), Prefix[0..$-1]),
                BitPacked!(typeof(Prefix[$-2](Key.init)), fullBits - bitSizeOf!(Prefix[$-1]))
            );
    }
}
5784 
5785 //============================================================================
5786 
// Three-way comparison of two property names that ignores ASCII case,
// white space, '-' and '_'. Returns the result of `cmp` on what remains.
@safe pure int comparePropertyName(Char1, Char2)(const(Char1)[] a, const(Char2)[] b)
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    import std.algorithm.comparison : cmp;
    import std.algorithm.iteration : map, filter;
    import std.ascii : toLower;

    // true for characters that take part in the comparison
    static bool significant(dchar c)
    {
        return !(c.isWhite || c == '-' || c == '_');
    }

    auto lhs = a.map!toLower.filter!significant;
    auto rhs = b.map!toLower.filter!significant;
    return cmp(lhs, rhs);
}
5798 
// '-' and letter case are ignored, so these names compare equal (result 0).
@safe pure unittest
{
    assert(!comparePropertyName("foo-bar", "fooBar"));
}
5803 
// Strict "less than" predicate built on comparePropertyName.
bool propertyNameLess(Char1, Char2)(const(Char1)[] a, const(Char2)[] b) @safe pure
if (is(Char1 : dchar) && is(Char2 : dchar))
{
    immutable order = comparePropertyName(a, b);
    return order < 0;
}
5809 
5810 //============================================================================
5811 // Utilities for compression of Unicode code point sets
5812 //============================================================================
5813 
// Appends `val` to `arr` in a 1, 2 or 3 byte variable-length encoding:
//   val < 128      -> 0vvvvvvv
//   val < 2^^13    -> 100vvvvv + 1 byte  (big-endian)
//   otherwise      -> 101vvvvv + 2 bytes (big-endian), val must be < 2^^21
@safe void compressTo(uint val, ref scope ubyte[] arr) pure nothrow
{
    // not optimized as usually done 1 time (and not public interface)
    enum ubyte twoByteTag = 0b1_00 << 5;
    enum ubyte threeByteTag = 0b1_01 << 5;

    if (val < 128)
    {
        arr ~= cast(ubyte) val;
        return;
    }

    if (val < (1 << 13))
    {
        arr ~= cast(ubyte)(twoByteTag | cast(ubyte)(val >> 8));
        arr ~= cast(ubyte)(val & 0xFF);
        return;
    }

    assert(val < (1 << 21));
    arr ~= cast(ubyte)(threeByteTag | cast(ubyte)(val >> 16));
    arr ~= cast(ubyte)((val >> 8) & 0xFF);
    arr ~= cast(ubyte)(val & 0xFF);
}
5832 
// Decodes one value written by `compressTo` starting at `arr[idx]` and moves
// `idx` past it. Throws (via `enforce`) when the trailing bytes are missing.
@safe uint decompressFrom(scope const(ubyte)[] arr, ref size_t idx) pure
{
    import std.exception : enforce;

    immutable lead = arr[idx++];
    // top bit clear: the byte is the value itself, [0 .. 127]
    if ((lead & 0x80) == 0)
        return lead;

    // bit 5 of the lead byte selects one or two trailing bytes
    immutable trailing = ((lead >> 5) & 1) + 1;
    enforce(idx + trailing <= arr.length, "bad code point interval encoding");

    uint result = (lead & 0x1F);
    immutable stop = idx + trailing;
    for (; idx < stop; ++idx)
        result = (result << 8) | arr[idx];
    return result;
}
5847 
5848 
// Serializes a range of [a, b) pairs as a stream of deltas, each written
// with `compressTo`. An upper bound equal to lastDchar+1 is left out.
package(std) ubyte[] compressIntervals(Range)(Range intervals)
if (isInputRange!Range && isIntegralPair!(ElementType!Range))
{
    ubyte[] encoded;
    uint previous = 0; // last boundary written, deltas are relative to it
    // RLE encode
    foreach (interval; intervals)
    {
        compressTo(interval[0]-previous, encoded);
        previous = interval[0];

        // till the end of the domain so don't store it
        if (interval[1] == lastDchar+1)
            continue;

        compressTo(interval[1]-previous, encoded);
        previous = interval[1];
    }
    return encoded;
}
5867 
// Checks the exact bytes produced by compressIntervals, value-by-value
// decoding with decompressFrom, and a full round trip.
@safe pure unittest
{
    import std.algorithm.comparison : equal;
    import std.typecons : tuple;

    auto run = [tuple(80, 127), tuple(128, (1 << 10)+128)];
    ubyte[] enc = [cast(ubyte) 80, 47, 1, (0b1_00 << 5) | (1 << 2), 0];
    assert(compressIntervals(run) == enc);
    // the last interval reaches lastDchar+1, so its upper bound is not stored
    auto run2 = [tuple(0, (1 << 20)+512+1), tuple((1 << 20)+512+4, lastDchar+1)];
    ubyte[] enc2 = [cast(ubyte) 0, (0b1_01 << 5) | (1 << 4), 2, 1, 3]; // odd length-ed
    assert(compressIntervals(run2) == enc2);
    size_t  idx = 0;
    assert(decompressFrom(enc, idx) == 80);
    assert(decompressFrom(enc, idx) == 47);
    assert(decompressFrom(enc, idx) == 1);
    assert(decompressFrom(enc, idx) == (1 << 10));
    idx = 0;
    assert(decompressFrom(enc2, idx) == 0);
    assert(decompressFrom(enc2, idx) == (1 << 20)+512+1);
    assert(equal(decompressIntervals(compressIntervals(run)), run));
    assert(equal(decompressIntervals(compressIntervals(run2)), run2));
}
5890 
// Wraps compressed interval data in a `DecompressedIntervals` range, which
// decodes one `CodepointInterval` at a time as it is advanced.
@safe package(std) auto decompressIntervals(const(ubyte)[] data) pure
{
    auto decoded = DecompressedIntervals(data);
    return decoded;
}
5896 
// Forward range of `CodepointInterval` decoded on the fly from the byte
// stream produced by `compressIntervals`.
@safe struct DecompressedIntervals
{
pure:
    const(ubyte)[] _stream;   // compressed data
    size_t _idx;              // read position; size_t.max marks exhaustion
    CodepointInterval _front; // most recently decoded interval

    // Decodes the first interval right away (or becomes empty at once).
    this(const(ubyte)[] stream)
    {
        _stream = stream;
        popFront();
    }

    @property bool empty() const
    {
        return _idx == size_t.max;
    }

    @property CodepointInterval front()
    {
        assert(!empty);
        return _front;
    }

    void popFront()
    {
        immutable end = _stream.length;
        if (_idx == end)
        {
            // nothing left to decode: switch to the exhausted state
            _idx = size_t.max;
            return;
        }

        // boundaries are stored as deltas from the previous boundary
        uint lower = _front[1];
        lower += decompressFrom(_stream, _idx);
        _front[0] = lower;

        if (_idx != _stream.length)
        {
            _front[1] = lower + decompressFrom(_stream, _idx);
        }
        else
        {
            // odd length ---> till the end
            _front[1] = lastDchar+1;
        }
    }

    @property DecompressedIntervals save() return scope { return this; }
}
5941 
// DecompressedIntervals must satisfy the input and forward range concepts.
@safe pure nothrow @nogc unittest
{
    static assert(isInputRange!DecompressedIntervals);
    static assert(isForwardRange!DecompressedIntervals);
}
5947 
5948 //============================================================================
5949 
5950 version (std_uni_bootstrap){}
5951 else
5952 {
5953 
// Helper for looking up code point sets: binary-searches `table` (assumed
// sorted by property name) and returns the index of the entry whose name
// matches `name` under comparePropertyName, or -1 when there is none.
ptrdiff_t findUnicodeSet(alias table, C)(const scope C[] name)
{
    import std.algorithm.iteration : map;
    import std.range : assumeSorted;

    auto names = assumeSorted!((a,b) => propertyNameLess(a,b))
        (table.map!"a.name"());
    // number of entries that sort strictly before `name`
    size_t pos = names.lowerBound(name).length;

    if (pos >= names.length || comparePropertyName(names[pos], name) != 0)
        return -1;
    return pos;
}
5966 
// Looks `name` up in `table` and, when found, stores the decoded set in
// `dest`. Returns whether the name was found; `dest` is untouched otherwise.
bool loadUnicodeSet(alias table, Set, C)(const scope C[] name, ref Set dest)
{
    auto position = findUnicodeSet!table(name);
    if (position < 0)
        return false;

    dest = Set(asSet(table[position].compressed));
    return true;
}
5978 
// Resolves a property `name` to a code point set stored in `target`.
// Names are compared with comparePropertyName. Cumulative categories
// (L, LC, M, N, P, S, Z, C), "graphical", "any" and "ascii" are assembled
// here; every other name is looked up in `uniProps.tab`.
// Returns: true when `target` was set, false when the name is unknown.
bool loadProperty(Set=CodepointSet, C)
    (const scope C[] name, ref Set target) pure
{
    import std.internal.unicode_tables : uniProps; // generated file
    alias ucmp = comparePropertyName;
    // conjure cumulative properties by hand
    if (ucmp(name, "L") == 0 || ucmp(name, "Letter") == 0)
    {
        target = asSet(uniProps.Lu);
        target |= asSet(uniProps.Ll);
        target |= asSet(uniProps.Lt);
        target |= asSet(uniProps.Lo);
        target |= asSet(uniProps.Lm);
    }
    else if (ucmp(name,"LC") == 0 || ucmp(name,"Cased Letter")==0)
    {
        target = asSet(uniProps.Ll);
        target |= asSet(uniProps.Lu);
        target |= asSet(uniProps.Lt);// Title case
    }
    else if (ucmp(name, "M") == 0 || ucmp(name, "Mark") == 0)
    {
        target = asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);
    }
    else if (ucmp(name, "N") == 0 || ucmp(name, "Number") == 0)
    {
        target = asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);
    }
    else if (ucmp(name, "P") == 0 || ucmp(name, "Punctuation") == 0)
    {
        target = asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);
    }
    else if (ucmp(name, "S") == 0 || ucmp(name, "Symbol") == 0)
    {
        target = asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
    }
    else if (ucmp(name, "Z") == 0 || ucmp(name, "Separator") == 0)
    {
        target = asSet(uniProps.Zs);
        target |= asSet(uniProps.Zl);
        target |= asSet(uniProps.Zp);
    }
    else if (ucmp(name, "C") == 0 || ucmp(name, "Other") == 0)
    {
        target = asSet(uniProps.Cc);
        target |= asSet(uniProps.Cf);
        target |= asSet(uniProps.Cs);
        target |= asSet(uniProps.Co);
        target |= asSet(uniProps.Cn);
    }
    else if (ucmp(name, "graphical") == 0)
    {
        // union of Alphabetic, marks, numbers, punctuation,
        // space separators (Zs only) and symbols
        target = asSet(uniProps.Alphabetic);

        target |= asSet(uniProps.Mn);
        target |= asSet(uniProps.Mc);
        target |= asSet(uniProps.Me);

        target |= asSet(uniProps.Nd);
        target |= asSet(uniProps.Nl);
        target |= asSet(uniProps.No);

        target |= asSet(uniProps.Pc);
        target |= asSet(uniProps.Pd);
        target |= asSet(uniProps.Ps);
        target |= asSet(uniProps.Pe);
        target |= asSet(uniProps.Pi);
        target |= asSet(uniProps.Pf);
        target |= asSet(uniProps.Po);

        target |= asSet(uniProps.Zs);

        target |= asSet(uniProps.Sm);
        target |= asSet(uniProps.Sc);
        target |= asSet(uniProps.Sk);
        target |= asSet(uniProps.So);
    }
    else if (ucmp(name, "any") == 0)
        target = Set.fromIntervals(0, 0x110000);
    else if (ucmp(name, "ascii") == 0)
        target = Set.fromIntervals(0, 0x80);
    else
        // not a hand-built name: defer to the generated table
        return loadUnicodeSet!(uniProps.tab)(name, target);
    return true;
}
6077 
// CTFE-only helper for checking property names at compile-time.
//
// Returns true when `name` matches (under comparePropertyName) one of the
// property names that loadProperty assembles by hand instead of reading from
// the generated table. The list must stay in sync with the branches of
// loadProperty: "C"/"Other" is built there as well, so it is listed here too;
// otherwise the name would be accepted by loadProperty but not recognised
// by this check.
@safe bool isPrettyPropertyName(C)(const scope C[] name)
{
    import std.algorithm.searching : find;
    auto names = [
        "L", "Letter",
        "LC", "Cased Letter",
        "M", "Mark",
        "N", "Number",
        "P", "Punctuation",
        "S", "Symbol",
        "Z", "Separator",
        "C", "Other",
        "Graphical",
        "any",
        "ascii"
    ];
    // linear scan is fine: the list is tiny and this only runs in CTFE
    auto hit = find!(candidate => comparePropertyName(candidate, name) == 0)(names);
    return !hit.empty;
}
6097 
// CTFE-only as well and not optimized: tells whether `table` contains
// a set whose name matches `name`.
@safe private static bool findSetName(alias table, C)(const scope C[] name)
{
    const pos = findUnicodeSet!table(name);
    // findUnicodeSet reports a miss with a negative position
    return !(pos < 0);
}
6103 
// Lookup front-end over a single generated `table`; `kind` names
// the table in error messages.
template SetSearcher(alias table, string kind)
{
    /// Run-time checked search; throws if no set called `name` exists.
    static auto opCall(C)(const scope C[] name)
        if (is(C : dchar))
    {
        import std.conv : to;
        CodepointSet found;
        if (!loadUnicodeSet!table(name, found))
        {
            throw new Exception("No unicode set for "~kind~" by name "
                ~name.to!string()~" was found.");
        }
        return found;
    }
    /// Compile-time checked search; an unknown `name` is a compile error.
    static @property auto opDispatch(string name)()
    {
        static if (!findSetName!table(name))
        {
            static assert(false, "No unicode set for "~kind~" by name "
                ~name~" was found.");
        }
        else
        {
            // the name is known to exist, so the result of the load is not checked
            CodepointSet found;
            loadUnicodeSet!table(name, found);
            return found;
        }
    }
}
6131 
// Characters that must be escaped when a string is used as a regular expression
6133 package(std) alias Escapables = AliasSeq!('[', ']', '\\', '^', '$', '.', '|', '?', ',', '-',
6134     ';', ':', '#', '&', '%', '/', '<', '>', '`',  '*', '+', '(', ')', '{', '}',  '~');
6135 
// Evaluates the D expression `expr` (mixed in as code) and returns its value.
// At run time the value is computed on the first call and served from a
// function-local static afterwards; during CTFE it is recomputed every time.
package(std) CodepointSet memoizeExpr(string expr)()
{
    // statics are of no use during CTFE: just evaluate
    if (__ctfe)
        return mixin(expr);

    alias Value = typeof(mixin(expr));
    static Value cached;
    static bool ready;

    if (ready)
        return cached;
    cached = mixin(expr);
    ready = true;
    return cached;
}
6150 
// Set behind the \w character class:
// Alphabetic | Mn | Mc | Me | Nd | Pc, memoized through memoizeExpr.
package(std) @property CodepointSet wordCharacter() @safe
{
    enum wordSetExpr = "unicode.Alphabetic | unicode.Mn | unicode.Mc
        | unicode.Me | unicode.Nd | unicode.Pc";
    return memoizeExpr!wordSetExpr();
}
6157 
// Basic array-backed stack, kept generic in case it gets used
// anywhere other than the parser.
package(std) struct Stack(T)
{
@safe:
    T[] data; // element storage; the top of the stack is the last element

    // true when nothing is stored
    @property bool empty(){ return data.length == 0; }

    // number of stored elements
    @property size_t length(){ return data.length; }

    // puts `val` on top of the stack
    void push(T val){ data ~= val;  }

    // Removes the top element and returns it; the stack must not be empty.
    @trusted T pop()
    {
        assert(!empty);
        immutable last = data.length - 1;
        auto val = data[last];
        data = data[0 .. last];
        // lets the next push reuse the freed slot; not available in CTFE
        if (!__ctfe)
            cast(void) data.assumeSafeAppend();
        return val;
    }

    // Reference to the top element; the stack must not be empty.
    @property ref T top()
    {
        assert(!empty);
        return data[data.length - 1];
    }
}
6185 
// Reads exactly `maxDigit` hexadecimal digits from the front of `str` and
// returns the code point they spell. The digits are consumed on success.
// Throws an Exception if the input ends early, if a non-hex character is met,
// or if the value exceeds 0x10FFFF.
package(std) dchar parseUniHex(Range)(ref Range str, size_t maxDigit)
{
    import std.exception : enforce;
    //std.conv.parse is both @system and bogus
    uint acc;
    for (int consumed = 0; consumed < maxDigit; ++consumed)
    {
        enforce(!str.empty, "incomplete escape sequence");
        // only ASCII digits are accepted, so a single element is one digit
        immutable digit = str.front;
        uint nibble;
        switch (digit)
        {
        case '0': .. case '9':
            nibble = digit - '0';
            break;
        case 'a': .. case 'f':
            nibble = digit - 'a' + 10;
            break;
        case 'A': .. case 'F':
            nibble = digit - 'A' + 10;
            break;
        default:
            throw new Exception("invalid escape sequence");
        }
        acc = acc * 16 + nibble;
        str.popFront();
    }
    enforce(acc <= 0x10FFFF, "invalid codepoint");
    return acc;
}
6211 
@safe unittest
{
    import std.algorithm.searching : canFind;
    import std.exception : collectException;

    // inputs holding a non-hex character must be rejected
    foreach (bad; [ "000j", "000z", "FffG", "0Z"])
        assert(collectException(parseUniHex(bad, bad.length)).msg
          .canFind("invalid escape sequence"));

    // well-formed inputs, paired by index with the value they decode to
    string[] inputs = [ "01", "ff", "00af", "10FFFF" ];
    int[] expected = [ 1, 0xFF, 0xAF, 0x10FFFF ];
    foreach (i, text; inputs)
        assert(parseUniHex(text, text.length) == expected[i]);

    // valid hex digits, but the value lies beyond the last code point
    string tooBig = "0011FFFF";
    assert(collectException(parseUniHex(tooBig, tooBig.length)).msg
      .canFind("invalid codepoint"));
}
6228 
// Returns `set` extended with the simple case foldings of every
// cased letter (unicode.LC) it contains.
auto caseEnclose(CodepointSet set)
{
    // snapshot of the cased members, taken before `set` starts to grow
    auto casedPart = set & unicode.LC;
    foreach (dchar letter; casedPart.byCodepoint)
        foreach (variant; simpleCaseFoldings(letter))
            set |= variant;
    return set;
}
6239 
/+
    Look up the codepoint set registered under `name` (InBlock or binary
    property), then optionally case-enclose it (`casefold`) and finally
    optionally invert it (`negated`), in that order.
+/
CodepointSet getUnicodeSet(const scope char[] name, bool negated,  bool casefold) @safe
{
    CodepointSet result = unicode(name);
    //FIXME: caseEnclose for new uni as Set | CaseEnclose(SET && LC)
    if (casefold)
        result = caseEnclose(result);
    return negated ? result.inverted : result;
}
6253 
// Parser for regex-style set expressions such as `[a-z&&[^aeiou]]`.
//
// `parseCharTerm` is a state machine that reads one "term" (plain characters,
// ranges, escapes) together with the operator that follows it; `parseSet`
// drives it and evaluates the operators with a value stack and an operator
// stack. When `casefold_` is set, characters are added together with their
// simple case foldings.
struct UnicodeSetParser(Range)
{
    import std.exception : enforce;
    import std.typecons : tuple, Tuple;
    Range range;     // remaining input
    bool casefold_;  // add simple case foldings of every character

    // the parser itself is a range over the remaining input
    @property bool empty(){ return range.empty; }
    @property dchar front(){ return range.front; }
    void popFront(){ range.popFront(); }

    //CodepointSet operations relatively in order of priority
    enum Operator:uint {
        Open = 0, Negate,  Difference, SymDifference, Intersection, Union, None
    }

    //parse unit of CodepointSet spec, most notably escape sequences and char ranges
    //also fetches next set operation
    // Returns: the set collected so far and the operator that ended the term
    // (Operator.None when the term was ended by ']').
    // Throws: Exception on malformed input or when the input ends prematurely.
    Tuple!(CodepointSet,Operator) parseCharTerm()
    {
        import std.range : drop;
        enum privateUseStart = '\U000F0000', privateUseEnd ='\U000FFFFD';
        enum State{ Start, Char, Escape, CharDash, CharDashEscape,
            PotentialTwinSymbolOperator }
        Operator op = Operator.None;
        dchar last; // most recent character not yet committed to `set`
        CodepointSet set;
        State state = State.Start;

        // add `ch` to `set`, with all its simple case foldings if casefold_ is on
        void addWithFlags(ref CodepointSet set, uint ch)
        {
            if (casefold_)
            {
                auto foldings = simpleCaseFoldings(ch);
                foreach (v; foldings)
                    set |= v;
            }
            else
                set |= ch;
        }

        // map a doubled symbol (||, --, ~~, &&) to its set operator
        static Operator twinSymbolOperator(dchar symbol)
        {
            switch (symbol)
            {
            case '|':
                return Operator.Union;
            case '-':
                return Operator.Difference;
            case '~':
                return Operator.SymDifference;
            case '&':
                return Operator.Intersection;
            default:
                assert(false);
            }
        }

        L_CharTermLoop:
        for (;;)
        {
            final switch (state)
            {
            case State.Start:
                switch (front)
                {
                case '|':
                case '-':
                case '~':
                case '&':
                    state = State.PotentialTwinSymbolOperator;
                    last = front;
                    break;
                case '[':
                    // a nested set is joined by implicit union
                    op = Operator.Union;
                    goto case;
                case ']':
                    break L_CharTermLoop;
                case '\\':
                    state = State.Escape;
                    break;
                default:
                    state = State.Char;
                    last = front;
                }
                break;
            case State.Char:
                // xxx last front xxx
                switch (front)
                {
                case '|':
                case '~':
                case '&':
                    // then last is treated as normal char and added as implicit union
                    state = State.PotentialTwinSymbolOperator;
                    addWithFlags(set, last);
                    last = front;
                    break;
                case '-': // still need more info
                    state = State.CharDash;
                    break;
                case '\\':
                    // commit `last` through addWithFlags like every other branch
                    // does; a bare `set |= last` would ignore casefold_
                    addWithFlags(set, last);
                    state = State.Escape;
                    break;
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    addWithFlags(set, last);
                    break L_CharTermLoop;
                default:
                    state = State.Char;
                    addWithFlags(set, last);
                    last = front;
                }
                break;
            case State.PotentialTwinSymbolOperator:
                // xxx last front xxxx
                // where last = [|-&~]
                if (front == last)
                {
                    op = twinSymbolOperator(last);
                    popFront();//skip second twin char
                    break L_CharTermLoop;
                }
                // a single symbol is an ordinary character
                goto case State.Char;
            case State.Escape:
                // xxx \ front xxx
                switch (front)
                {
                case 'f':
                    last = '\f';
                    state = State.Char;
                    break;
                case 'n':
                    last = '\n';
                    state = State.Char;
                    break;
                case 'r':
                    last = '\r';
                    state = State.Char;
                    break;
                case 't':
                    last = '\t';
                    state = State.Char;
                    break;
                case 'v':
                    last = '\v';
                    state = State.Char;
                    break;
                case 'c':
                    last = unicode.parseControlCode(this);
                    state = State.Char;
                    break;
                // one case label per escapable character, all sharing the body below
                foreach (val; Escapables)
                {
                case val:
                }
                    last = front;
                    state = State.Char;
                    break;
                case 'p':
                    set.add(unicode.parsePropertySpec(this, false, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'P':
                    set.add(unicode.parsePropertySpec(this, true, casefold_));
                    state = State.Start;
                    continue L_CharTermLoop; //next char already fetched
                case 'x':
                    popFront();
                    last = parseUniHex(this, 2);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    last = parseUniHex(this, 4);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    last = parseUniHex(this, 8);
                    state = State.Char;
                    continue L_CharTermLoop;
                case 'd':
                    set.add(unicode.Nd);
                    state = State.Start;
                    break;
                case 'D':
                    set.add(unicode.Nd.inverted);
                    state = State.Start;
                    break;
                case 's':
                    set.add(unicode.White_Space);
                    state = State.Start;
                    break;
                case 'S':
                    set.add(unicode.White_Space.inverted);
                    state = State.Start;
                    break;
                case 'w':
                    set.add(wordCharacter);
                    state = State.Start;
                    break;
                case 'W':
                    set.add(wordCharacter.inverted);
                    state = State.Start;
                    break;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                break;
            case State.CharDash:
                // xxx last - front xxx
                switch (front)
                {
                case '[':
                    op = Operator.Union;
                    goto case;
                case ']':
                    //means dash is a single char not an interval specifier
                    addWithFlags(set, last);
                    addWithFlags(set, '-');
                    break L_CharTermLoop;
                 case '-'://set Difference again
                    addWithFlags(set, last);
                    op = Operator.Difference;
                    popFront();//skip '-'
                    break L_CharTermLoop;
                case '\\':
                    state = State.CharDashEscape;
                    break;
                default:
                    enforce(last <= front, "inverted range");
                    if (casefold_)
                    {
                        for (uint ch = last; ch <= front; ch++)
                            addWithFlags(set, ch);
                    }
                    else
                        set.add(last, front + 1);
                    state = State.Start;
                }
                break;
            case State.CharDashEscape:
            //xxx last - \ front xxx
            // NOTE(review): ranges whose upper bound is an escape are added
            // verbatim below even when casefold_ is set, unlike the CharDash
            // default branch above - confirm whether that is intended.
                uint end;
                switch (front)
                {
                case 'f':
                    end = '\f';
                    break;
                case 'n':
                    end = '\n';
                    break;
                case 'r':
                    end = '\r';
                    break;
                case 't':
                    end = '\t';
                    break;
                case 'v':
                    end = '\v';
                    break;
                foreach (val; Escapables)
                {
                case val:
                }
                    end = front;
                    break;
                case 'c':
                    end = unicode.parseControlCode(this);
                    break;
                case 'x':
                    popFront();
                    end = parseUniHex(this, 2);
                    enforce(last <= end,"inverted range");
                    set.add(last, end + 1);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'u':
                    popFront();
                    end = parseUniHex(this, 4);
                    enforce(last <= end,"inverted range");
                    set.add(last, end + 1);
                    state = State.Start;
                    continue L_CharTermLoop;
                case 'U':
                    popFront();
                    end = parseUniHex(this, 8);
                    enforce(last <= end,"inverted range");
                    set.add(last, end + 1);
                    state = State.Start;
                    continue L_CharTermLoop;
                default:
                    if (front >= privateUseStart && front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                    enforce(false, "invalid escape sequence");
                }
                // Lookahead to check if it's a \T
                // where T is sub-pattern terminator in multi-pattern scheme
                auto lookahead = range.save.drop(1);
                if (end == '\\' && !lookahead.empty)
                {
                    if (lookahead.front >= privateUseStart && lookahead.front <= privateUseEnd)
                        enforce(false, "no matching ']' found while parsing character class");
                }
                enforce(last <= end,"inverted range");
                set.add(last, end + 1);
                state = State.Start;
                break;
            }
            popFront();
            enforce(!empty, "unexpected end of CodepointSet");
        }
        return tuple(set, op);
    }

    alias ValStack = Stack!(CodepointSet);
    alias OpStack = Stack!(Operator);

    // Parses one complete `[...]` expression (nested sets included) starting
    // at the current position and returns the resulting set.
    // Throws: Exception if the input is empty, does not start with '[',
    // or is malformed.
    CodepointSet parseSet()
    {
        ValStack vstack;
        OpStack opstack;
        import std.functional : unaryFun;
        enforce(!empty, "unexpected end of input");
        enforce(front == '[', "expected '[' at the start of unicode set");
        // Applies `op` to the operand(s) on top of `stack`.
        // Returns false for anything that is not an applicable operator.
        static bool apply(Operator op, ref ValStack stack)
        {
            switch (op)
            {
            case Operator.Negate:
                enforce(!stack.empty, "no operand for '^'");
                stack.top = stack.top.inverted;
                break;
            case Operator.Union:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '||'");
                stack.top.add(s);
                break;
            case Operator.Difference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '--'");
                stack.top.sub(s);
                break;
            case Operator.SymDifference:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '~~'");
                stack.top ~= s;
                break;
            case Operator.Intersection:
                auto s = stack.pop();//2nd operand
                enforce(!stack.empty, "no operand for '&&'");
                stack.top.intersect(s);
                break;
            default:
                return false;
            }
            return true;
        }
        // Pops and applies operators for as long as `cond` holds for the top one.
        // Returns false if an operator could not be applied or if the operator
        // stack ran empty along the way.
        static bool unrollWhile(alias cond)(ref ValStack vstack, ref OpStack opstack)
        {
            while (cond(opstack.top))
            {
                if (!apply(opstack.pop(),vstack))
                    return false;//syntax error
                if (opstack.empty)
                    return false;
            }
            return true;
        }

        L_CharsetLoop:
        do
        {
            switch (front)
            {
            case '[':
                opstack.push(Operator.Open);
                popFront();
                enforce(!empty, "unexpected end of character class");
                if (front == '^')
                {
                    opstack.push(Operator.Negate);
                    popFront();
                    enforce(!empty, "unexpected end of character class");
                }
                else if (front == ']') // []...] is special cased
                {
                    popFront();
                    enforce(!empty, "wrong character set");
                    auto pair = parseCharTerm();
                    pair[0].add(']', ']'+1);
                    if (pair[1] != Operator.None)
                    {
                        if (opstack.top == Operator.Union)
                            unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                        opstack.push(pair[1]);
                    }
                    vstack.push(pair[0]);
                }
                break;
            case ']':
                // evaluate everything back to the matching '['
                enforce(unrollWhile!(unaryFun!"a != a.Open")(vstack, opstack),
                    "character class syntax error");
                enforce(!opstack.empty, "unmatched ']'");
                opstack.pop();
                popFront();
                // the outermost set is closed: done
                if (opstack.empty)
                    break L_CharsetLoop;
                auto pair  = parseCharTerm();
                if (!pair[0].empty)//not only operator e.g. -- or ~~
                {
                    vstack.top.add(pair[0]);//apply union
                }
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                break;
            //
            default://yet another pair of term(op)?
                auto pair = parseCharTerm();
                if (pair[1] != Operator.None)
                {
                    if (opstack.top == Operator.Union)
                        unrollWhile!(unaryFun!"a == a.Union")(vstack, opstack);
                    opstack.push(pair[1]);
                }
                vstack.push(pair[0]);
            }

        }while (!empty || !opstack.empty);
        while (!opstack.empty)
            apply(opstack.pop(),vstack);
        assert(vstack.length == 1);
        return vstack.top;
    }
}
6700 
6701 /**
6702     A single entry point to lookup Unicode $(CODEPOINT) sets by name or alias of
6703     a block, script or general category.
6704 
6705     It uses well defined standard rules of property name lookup.
6706     This includes fuzzy matching of names, so that
6707     'White_Space', 'white-SpAce' and 'whitespace' are all considered equal
6708     and yield the same set of white space $(CHARACTERS).
6709 */
6710 @safe public struct unicode
6711 {
6712     import std.exception : enforce;
6713     /**
6714         Performs the lookup of set of $(CODEPOINTS)
6715         with compile-time correctness checking.
6716         This short-cut version combines 3 searches:
6717         across blocks, scripts, and common binary properties.
6718 
6719         Note that since scripts and blocks overlap the
6720         usual trick to disambiguate is used - to get a block use
6721         `unicode.InBlockName`, to search a script
6722         use `unicode.ScriptName`.
6723 
6724         See_Also: $(LREF block), $(LREF script)
6725         and (not included in this search) $(LREF hangulSyllableType).
6726     */
6727 
6728     static @property auto opDispatch(string name)() pure
6729     {
6730         static if (findAny(name))
6731             return loadAny(name);
6732         else
6733             static assert(false, "No unicode set by name "~name~" was found.");
6734     }
6735 
6736     ///
    @safe unittest
    {
        import std.exception : collectException;
        // compile-time checked lookup of the ASCII set
        auto ascii = unicode.ASCII;
        assert(ascii['A']);
        assert(ascii['~']);
        assert(!ascii['\u00e0']);
        // matching is case-insensitive
        assert(ascii == unicode.ascII);
        // 'à' is U+00E0, the code point already checked above
        assert(!ascii['à']);
        // underscores, '-' and whitespace in names are ignored too
        auto latin = unicode.in_latin1_Supplement;
        assert(latin['à']);
        assert(!latin['$']);
        // BTW Latin 1 Supplement is a block, hence "In" prefix
        assert(latin == unicode("In Latin 1 Supplement"));
        // run-time look up throws if no such set is found
        assert(collectException(unicode("InCyrilliac")));
    }
6756 
6757     /**
6758         The same lookup across blocks, scripts, or binary properties,
6759         but performed at run-time.
6760         This version is provided for cases where `name`
6761         is not known beforehand; otherwise compile-time
6762         checked $(LREF opDispatch) is typically a better choice.
6763 
6764         See the $(S_LINK Unicode properties, table of properties) for available
6765         sets.
6766     */
6767     static auto opCall(C)(const scope C[] name)
6768         if (is(C : dchar))
6769     {
6770         return loadAny(name);
6771     }
6772 
6773     /**
6774         Narrows down the search for sets of $(CODEPOINTS) to all Unicode blocks.
6775 
6776         Note:
6777         Here block names are unambiguous as no scripts are searched
6778         and thus to search use simply `unicode.block.BlockName` notation.
6779 
6780         See $(S_LINK Unicode properties, table of properties) for available sets.
6781         See_Also: $(S_LINK Unicode properties, table of properties).
6782     */
    struct block
    {
        import std.internal.unicode_tables : blocks; // generated file
        // SetSearcher supplies the run-time opCall and the compile-time
        // opDispatch lookups, both restricted to the block table
        mixin SetSearcher!(blocks.tab, "block");
    }
6788 
6789     ///
    @safe unittest
    {
        // use .block for explicitness
        // (the combined lookup selects a block through the "In" prefix)
        assert(unicode.block.Greek_and_Coptic == unicode.InGreek_and_Coptic);
    }
6795 
6796     /**
6797         Narrows down the search for sets of $(CODEPOINTS) to all Unicode scripts.
6798 
6799         See the $(S_LINK Unicode properties, table of properties) for available
6800         sets.
6801     */
    struct script
    {
        import std.internal.unicode_tables : scripts; // generated file
        // SetSearcher supplies the run-time opCall and the compile-time
        // opDispatch lookups, both restricted to the script table
        mixin SetSearcher!(scripts.tab, "script");
    }
6807 
6808     ///
    @safe unittest
    {
        // the same name looked up once as a script and once as a block
        auto arabicScript = unicode.script.arabic;
        auto arabicBlock = unicode.block.arabic;
        // there is an intersection between script and block
        assert(arabicBlock['؁']);
        assert(arabicScript['؁']);
        // but they are different
        assert(arabicBlock != arabicScript);
        // the combined lookup: "in" prefix selects the block, a bare name the script
        assert(arabicBlock == unicode.inArabic);
        assert(arabicScript == unicode.arabic);
    }
6821 
6822     /**
6823         Fetch a set of $(CODEPOINTS) that have the given hangul syllable type.
6824 
6825         Other non-binary properties (once supported) follow the same
6826         notation - `unicode.propertyName.propertyValue` for compile-time
6827         checked access and `unicode.propertyName(propertyValue)`
6828         for run-time checked one.
6829 
6830         See the $(S_LINK Unicode properties, table of properties) for available
6831         sets.
6832     */
    struct hangulSyllableType
    {
        import std.internal.unicode_tables : hangul; // generated file
        // SetSearcher supplies the run-time opCall and the compile-time
        // opDispatch lookups, both restricted to the hangul table
        mixin SetSearcher!(hangul.tab, "hangul syllable type");
    }
6838 
6839     ///
6840     @safe unittest
6841     {
6842         // L here is syllable type not Letter as in unicode.L short-cut
6843         auto leadingVowel = unicode.hangulSyllableType("L");
6844         // check that some leading vowels are present
6845         foreach (vowel; '\u1110'..'\u115F')
6846             assert(leadingVowel[vowel]);
6847         assert(leadingVowel == unicode.hangulSyllableType.L);
6848     }
6849 
6850     //parse control code of form \cXXX, c assumed to be the current symbol
6851     static package(std) dchar parseControlCode(Parser)(ref Parser p)
6852     {
6853         with(p)
6854         {
6855             popFront();
6856             enforce(!empty, "Unfinished escape sequence");
6857             enforce(('a' <= front && front <= 'z')
6858                 || ('A' <= front && front <= 'Z'),
6859             "Only letters are allowed after \\c");
6860             return front & 0x1f;
6861         }
6862     }
6863 
6864     //parse and return a CodepointSet for \p{...Property...} and \P{...Property..},
6865     //\ - assumed to be processed, p - is current
6866     static package(std) CodepointSet parsePropertySpec(Range)(ref Range p,
6867         bool negated, bool casefold)
6868     {
6869         static import std.ascii;
6870         with(p)
6871         {
6872             enum MAX_PROPERTY = 128;
6873             char[MAX_PROPERTY] result;
6874             uint k = 0;
6875             popFront();
6876             enforce(!empty, "eof parsing unicode property spec");
6877             if (front == '{')
6878             {
6879                 popFront();
6880                 while (k < MAX_PROPERTY && !empty && front !='}'
6881                     && front !=':')
6882                 {
6883                     if (front != '-' && front != ' ' && front != '_')
6884                         result[k++] = cast(char) std.ascii.toLower(front);
6885                     popFront();
6886                 }
6887                 enforce(k != MAX_PROPERTY, "invalid property name");
6888                 enforce(front == '}', "} expected ");
6889             }
6890             else
6891             {//single char properties e.g.: \pL, \pN ...
6892                 enforce(front < 0x80, "invalid property name");
6893                 result[k++] = cast(char) front;
6894             }
6895             auto s = getUnicodeSet(result[0 .. k], negated, casefold);
6896             enforce(!s.empty, "unrecognized unicode property spec");
6897             popFront();
6898             return s;
6899         }
6900     }
6901 
6902     /**
6903         Parse unicode codepoint set from given `range` using standard regex
6904         syntax '[...]'. The range is advanced skiping over regex set definition.
6905         `casefold` parameter determines if the set should be casefolded - that is
6906         include both lower and upper case versions for any letters in the set.
6907     */
6908     static CodepointSet parseSet(Range)(ref Range range, bool casefold=false)
6909     if (isInputRange!Range && is(ElementType!Range : dchar))
6910     {
6911         auto usParser = UnicodeSetParser!Range(range, casefold);
6912         auto set = usParser.parseSet();
6913         range = usParser.range;
6914         return set;
6915     }
6916 
    ///
    @safe unittest
    {
        import std.uni : unicode;
        string pat = "[a-zA-Z0-9]hello";
        // parseSet takes `pat` by reference and advances it
        auto set = unicode.parseSet(pat);
        // check some of the codepoints
        assert(set['a'] && set['A'] && set['9']);
        // the set definition was consumed, only the remainder is left
        assert(pat == "hello");
    }
6927 
6928 private:
6929     alias ucmp = comparePropertyName;
6930 
6931     static bool findAny(string name)
6932     {
6933         import std.internal.unicode_tables : blocks, scripts, uniProps; // generated file
6934         return isPrettyPropertyName(name)
6935             || findSetName!(uniProps.tab)(name) || findSetName!(scripts.tab)(name)
6936             || (ucmp(name[0 .. 2],"In") == 0 && findSetName!(blocks.tab)(name[2..$]));
6937     }
6938 
6939     static auto loadAny(Set=CodepointSet, C)(const scope C[] name) pure
6940     {
6941         import std.conv : to;
6942         import std.internal.unicode_tables : blocks, scripts; // generated file
6943         Set set;
6944         immutable loaded = loadProperty(name, set) || loadUnicodeSet!(scripts.tab)(name, set)
6945             || (name.length > 2 && ucmp(name[0 .. 2],"In") == 0
6946                 && loadUnicodeSet!(blocks.tab)(name[2..$], set));
6947         if (loaded)
6948             return set;
6949         throw new Exception("No unicode set by name "~name.to!string()~" was found.");
6950     }
6951 
6952     // FIXME: re-disable once the compiler is fixed
6953     // Disabled to prevent the mistake of creating instances of this pseudo-struct.
6954     //@disable ~this();
6955 }
6956 
// Run-time lookup by name: blocks via the "In" prefix, and a pretty property
// name that resolves to a union of several general-category sets.
@safe unittest
{
    import std.internal.unicode_tables : blocks, uniProps; // generated file
    assert(unicode("InHebrew") == asSet(blocks.Hebrew));
    assert(unicode("separator") == (asSet(uniProps.Zs) | asSet(uniProps.Zl) | asSet(uniProps.Zp)));
    // a hyphen inside the name does not prevent the match
    assert(unicode("In-Kharoshthi") == asSet(blocks.Kharoshthi));
}
6964 
6965 enum EMPTY_CASE_TRIE = ushort.max;// from what gen_uni uses internally
6966 
6967 // TODO: redo the most of hangul stuff algorithmically in case of Graphemes too
6968 // Use combined trie instead of checking for '\r' | '\n' | ccTrie,
6969 //   or extend | '\u200D' separately
6970 
// True if `ch` is one of the regional indicator symbols
// U+1F1E6 .. U+1F1FF (both bounds inclusive).
private static bool isRegionalIndicator(dchar ch) @safe pure @nogc nothrow
{
    enum dchar firstIndicator = '\U0001F1E6';
    enum dchar lastIndicator = '\U0001F1FF';
    return !(ch < firstIndicator || ch > lastIndicator);
}
6975 
// Our grapheme decoder is a state machine, this is the list of all possible
// states before each code point.
// The members are used as indices into `graphemeTransforms`, so their order
// must stay in sync with that table.
private enum GraphemeState
{
    Start,    // initial state, before the first code point of the cluster
    CR,       // the previous code point was '\r'
    RI,       // exactly one regional indicator has been seen
    L,        // inside a Hangul sequence, after a leading jamo
    V,        // inside a Hangul sequence, after an LV syllable or a vowel jamo
    LVT,      // inside a Hangul sequence, after an LVT syllable or a trailing jamo
    Emoji,    // inside an emoji sequence, last code point was not a ZWJ
    EmojiZWJ, // inside an emoji sequence, last code point was a ZWJ
    Prepend,  // the previous code point was a Prepend character
    End       // only extending characters, ZWJ or spacing marks may still join
}
6991 
// Result of a single state transition: tells the decoding loop whether the
// end of the grapheme has been reached.
private enum TransformRes
{
    // No, unless the source range ends here
    // (GB2 - break at end of text, unless text is empty)
    goOn,
    redo, // Run last character again with new state
    retInclude, // Yes, after the just iterated character
    retExclude // Yes, before the just iterated character
}
7002 
// The logic of the grapheme decoding is all here
// GB# means Grapheme Breaking rule number # - see Unicode standard annex #29
// Note, getting GB1 (break at start of text, unless text is empty) right
// relies on the user starting grapheme walking from beginning of the text, and
// not attempting to walk an empty text.
//
// The table is indexed by GraphemeState. Each entry receives the current
// state by reference (and may change it) plus the code point under
// inspection, and answers with a TransformRes telling the caller how to
// proceed.
private enum TransformRes
    function(ref GraphemeState, dchar) @safe pure nothrow @nogc [] graphemeTransforms =
[
    GraphemeState.Start: (ref state, ch)
    {
        // GB4. Break after controls.
        if (graphemeControlTrie[ch] || ch == '\n')
            return TransformRes.retInclude;

        // Classify the first code point; the first matching test wins.
        with (GraphemeState) state =
            ch == '\r' ? CR :
            isRegionalIndicator(ch) ? RI :
            isHangL(ch) ? L :
            hangLV[ch] || isHangV(ch) ? V :
            hangLVT[ch] || isHangT(ch) ? LVT :
            prependTrie[ch] ? Prepend :
            xpictoTrie[ch] ? Emoji :
            End;

        // No matter what we encountered, we always include the
        // first code point in the grapheme.
        return TransformRes.goOn;
    },

    // GB3, GB4. Do not break between a CR and LF.
    // Otherwise, break after controls.
    GraphemeState.CR: (ref state, ch) => ch == '\n' ?
        TransformRes.retInclude :
        TransformRes.retExclude,

    // GB12 - GB13. Do not break within emoji flag sequences.
    // That is, do not break between regional indicator (RI) symbols if
    // there is an odd number of RI characters before the break point.
    // This state applies if one and only one RI code point has been
    // encountered.
    GraphemeState.RI: (ref state, ch)
    {
        // Either way the pair is complete (or abandoned) after this step.
        state = GraphemeState.End;

        return isRegionalIndicator(ch) ?
            TransformRes.goOn :
            TransformRes.redo;
    },

    // GB6. Do not break Hangul syllable sequences.
    GraphemeState.L: (ref state, ch)
    {
        if (isHangL(ch))
            return TransformRes.goOn;
        else if (isHangV(ch) || hangLV[ch])
        {
            state = GraphemeState.V;
            return TransformRes.goOn;
        }
        else if (hangLVT[ch])
        {
            state = GraphemeState.LVT;
            return TransformRes.goOn;
        }

        // Not part of the Hangul sequence: let the End state decide.
        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB7. Do not break Hangul syllable sequences.
    GraphemeState.V: (ref state, ch)
    {
        if (isHangV(ch))
            return TransformRes.goOn;
        else if (isHangT(ch))
        {
            state = GraphemeState.LVT;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB8. Do not break Hangul syllable sequences.
    GraphemeState.LVT: (ref state, ch)
    {
        if (isHangT(ch))
            return TransformRes.goOn;

        state = GraphemeState.End;
        return TransformRes.redo;
    },

    // GB11. Do not break within emoji modifier sequences or emoji
    // zwj sequences. This state applies when the last code point was
    // NOT a ZWJ.
    GraphemeState.Emoji: (ref state, ch)
    {
        if (graphemeExtendTrie[ch])
            return TransformRes.goOn;

        // The ZWJ check below relies on ZWJ not being covered by the
        // extend trie (otherwise it would have been consumed above).
        static assert(!graphemeExtendTrie['\u200D']);

        if (ch == '\u200D')
        {
            state = GraphemeState.EmojiZWJ;
            return TransformRes.goOn;
        }

        state = GraphemeState.End;
        // There might still be spacing marks at the end, which are
        // not allowed in the middle of emoji sequences
        return TransformRes.redo;
    },

    // GB11. Do not break within emoji modifier sequences or emoji
    // zwj sequences. This state applies when the last code point was
    // a ZWJ.
    GraphemeState.EmojiZWJ: (ref state, ch)
    {
        state = GraphemeState.Emoji;
        if (xpictoTrie[ch])
            return TransformRes.goOn;
        // Anything else is re-examined by the Emoji state.
        return TransformRes.redo;
    },

    // GB9b. Do not break after Prepend characters.
    GraphemeState.Prepend: (ref state, ch)
    {
        // GB5. Break before controls.
        if (graphemeControlTrie[ch] || ch == '\r' || ch == '\n')
            return TransformRes.retExclude;

        // Otherwise classify `ch` as if it were the first code point.
        state = GraphemeState.Start;
        return TransformRes.redo;
    },

    // GB9, GB9a. Do not break before extending characters, ZWJ
    // or SpacingMarks.
    // GB999. Otherwise, break everywhere.
    GraphemeState.End: (ref state, ch)
        => !graphemeExtendTrie[ch] && !spacingMarkTrie[ch] && ch != '\u200D' ?
            TransformRes.retExclude :
            TransformRes.goOn
];
7150 
// Driver of the grapheme state machine: consumes exactly one grapheme
// cluster from the front of `range`.
// With `getValue == true` the consumed code points are collected and
// returned as a Grapheme; with `getValue == false` nothing is returned and
// the range is merely advanced.
// The range must not be empty on entry (checked by an assert).
template genericDecodeGrapheme(bool getValue)
{
    static if (getValue)
        alias Value = Grapheme;
    else
        alias Value = void;

    Value genericDecodeGrapheme(Input)(ref Input range)
    {
        static if (getValue)
            Grapheme grapheme;
        auto state = GraphemeState.Start;
        dchar ch;

        assert(!range.empty, "Attempting to decode grapheme from an empty " ~ Input.stringof);
    outer:
        // Running out of input also ends the cluster.
        while (!range.empty)
        {
            ch = range.front;

        rerun:
            final switch (graphemeTransforms[state](state, ch))
                with(TransformRes)
            {
            case goOn:
                // `ch` belongs to the cluster; keep going.
                static if (getValue)
                    grapheme ~= ch;
                range.popFront();
                continue;

            case redo:
                // The transform changed `state`; evaluate the same `ch`
                // again without consuming it.
                goto rerun;

            case retInclude:
                // `ch` is the last code point of the cluster.
                static if (getValue)
                    grapheme ~= ch;
                range.popFront();
                break outer;

            case retExclude:
                // The cluster ends before `ch`, which stays in the range.
                break outer;
            }
        }

        static if (getValue)
            return grapheme;
    }
}
7199 
7200 public: // Public API continues
7201 
/++
    Computes the length of the grapheme cluster starting at `index`.
    Both the resulting length and the `index` are measured
    in $(S_LINK Code unit, code units).

    Params:
        C = code unit type, implicitly convertible to `dchar`
        input = array of code units to examine
        index = starting index into `input[]`; `input[index .. $]` must not
                be empty

    Returns:
        length of the grapheme cluster, in code units
+/
size_t graphemeStride(C)(const scope C[] input, size_t index) @safe pure
if (is(C : dchar))
{
    auto rest = input[index .. $];
    immutable before = rest.length;
    // Only the advancement of `rest` is of interest, not the decoded value.
    genericDecodeGrapheme!false(rest);
    return before - rest.length;
}
7223 
///
@safe unittest
{
    assert(graphemeStride("  ", 1) == 1);
    // A + combining ring above
    string city = "A\u030Arhus";
    size_t first = graphemeStride(city, 0);
    assert(first == 3); //\u030A has 2 UTF-8 code units
    assert(city[0 .. first] == "A\u030A");
    assert(city[first..$] == "rhus");
}
7235 
@safe unittest
{
    // Ensure that graphemeStride is usable from CTFE.
    // (initializing an enum forces compile-time evaluation)
    enum c1 = graphemeStride("A", 0);
    static assert(c1 == 1);

    enum c2 = graphemeStride("A\u0301", 0);
    static assert(c2 == 3); // \u0301 has 2 UTF-8 code units
}
7245 
// TODO: make this @nogc. Probably no big deal since the state machine is
// already GC-free.
// Emoji and Prepend sequences. All inputs are dstrings, so the strides
// below count code points.
@safe pure nothrow unittest
{
    // grinning face ~ emoji modifier fitzpatrick type-5 ~ grinning face
    assert(graphemeStride("\U0001F600\U0001f3FE\U0001F600"d, 0) == 2);
    // skier ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\u2640€"d, 0) == 1);
    // skier ~ emoji modifier fitzpatrick type-5 ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\U0001f3FE\u2640€"d, 0) == 2);
    // skier ~ zero-width joiner ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\u200D\u2640€"d, 0) == 3);
    // skier ~ emoji modifier fitzpatrick type-5 ~ zero-width joiner
    // ~ female sign ~ '€'
    assert(graphemeStride("\u26F7\U0001f3FE\u200D\u2640€"d, 0) == 4);
    // skier ~ zero-width joiner ~ '€'
    assert(graphemeStride("\u26F7\u200D€"d, 0) == 2);
    //'€' ~ zero-width joiner ~ skier
    assert(graphemeStride("€\u200D\u26F7"d, 0) == 2);
    // Kaithi number sign ~ Devanagari digit four ~ Devanagari digit two
    assert(graphemeStride("\U000110BD\u096A\u0968"d, 0) == 2);
    // Kaithi number sign ~ null
    assert(graphemeStride("\U000110BD\0"d, 0) == 1);
}
7270 
/++
    Reads one full grapheme cluster from an
    $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of dchar `inp`.

    For examples see the $(LREF Grapheme) below.

    Params:
        inp = input range of `dchar` to read from; it is advanced past the
              decoded cluster and must not be empty

    Returns:
        the decoded cluster as a $(LREF Grapheme)

    Note:
    This function modifies `inp` and thus `inp`
    must be an L-value.
+/
Grapheme decodeGrapheme(Input)(ref Input inp)
if (isInputRange!Input && is(immutable ElementType!Input == immutable dchar))
{
    return genericDecodeGrapheme!true(inp);
}
7286 
@safe unittest
{
    import std.algorithm.comparison : equal;

    Grapheme gr;
    // each decodeGrapheme call consumes one cluster from the front of `s`
    string s = " \u0020\u0308 ";
    gr = decodeGrapheme(s);
    assert(gr.length == 1 && gr[0] == ' ');
    gr = decodeGrapheme(s);
    assert(gr.length == 2 && equal(gr[0 .. 2], " \u0308"));
    // leading combining marks form a cluster of their own
    s = "\u0300\u0308\u1100";
    assert(equal(decodeGrapheme(s)[], "\u0300\u0308"));
    assert(equal(decodeGrapheme(s)[], "\u1100"));
    s = "\u11A8\u0308\uAC01";
    assert(equal(decodeGrapheme(s)[], "\u11A8\u0308"));
    assert(equal(decodeGrapheme(s)[], "\uAC01"));

    // Two consecutive flag sequences (the regional indicator pair G, B
    // twice); a single decode returns only the first pair
    s = "\U0001F1EC\U0001F1E7\U0001F1EC\U0001F1E7";
    assert(equal(decodeGrapheme(s)[], "\U0001F1EC\U0001F1E7"));
}
7308 
/++
    $(P Iterate a string by $(LREF Grapheme).)

    $(P Useful for doing string manipulation that needs to be aware
    of graphemes.)

    Params:
        range = input range of `dchar` to split into grapheme clusters

    Returns:
        An input range of $(LREF Grapheme); it is also a forward range
        whenever `Range` is one.

    See_Also:
        $(LREF byCodePoint)
+/
auto byGrapheme(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    // TODO: Bidirectional access
    static struct Result(R)
    {
        private R _range;
        // The current cluster; a zero-length Grapheme marks exhaustion.
        private Grapheme _front;

        @property bool empty()
        {
            return _front.length == 0;
        }

        @property Grapheme front()
        {
            return _front;
        }

        void popFront()
        {
            if (_range.empty)
                _front = Grapheme.init;
            else
                _front = decodeGrapheme(_range);
        }

        static if (isForwardRange!R)
        {
            @property Result save()
            {
                return Result(_range.save, _front);
            }
        }
    }

    // Prime the result so that `front` already holds the first cluster.
    auto graphemes = Result!Range(range);
    graphemes.popFront();
    return graphemes;
}
7355 
///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : take, drop;
    auto text = "noe\u0308l"; // noël using e + combining diaeresis
    assert(text.walkLength == 5); // 5 code points

    auto gText = text.byGrapheme;
    assert(gText.walkLength == 4); // 4 graphemes

    // graphemes can be compared and sliced with the usual range algorithms
    assert(gText.take(3).equal("noe\u0308".byGrapheme));
    assert(gText.drop(3).equal("l".byGrapheme));
}
7371 
// Test helper: an input-only range of dchar over a string. It offers no
// `save`, so it can be used for testing non-forward-range input ranges.
version (StdUnittest)
private static @safe struct InputRangeString
{
    private string s;

    @property bool empty() { return s.length == 0; }

    @property dchar front() { return s.front; }

    void popFront() { s.popFront(); }
}
7382 
// byGrapheme over empty input, over all three string widths, and over a
// non-forward input range.
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.array : array;
    import std.range : retro;
    import std.range.primitives : walkLength;
    assert("".byGrapheme.walkLength == 0);

    auto reverse = "le\u0308on";
    assert(reverse.walkLength == 5);

    auto gReverse = reverse.byGrapheme;
    assert(gReverse.walkLength == 4);

    static foreach (text; AliasSeq!("noe\u0308l"c, "noe\u0308l"w, "noe\u0308l"d))
    {{
        assert(text.walkLength == 5);
        static assert(isForwardRange!(typeof(text)));

        // forward-range capability carries over to the result
        auto gText = text.byGrapheme;
        static assert(isForwardRange!(typeof(gText)));
        assert(gText.walkLength == 4);
        assert(gText.array.retro.equal(gReverse));
    }}

    // an input-only source yields an input-only result
    auto nonForwardRange = InputRangeString("noe\u0308l").byGrapheme;
    static assert(!isForwardRange!(typeof(nonForwardRange)));
    assert(nonForwardRange.walkLength == 4);
}
7412 
// Issue 23474
// A combining mark directly after a CR does not join it: the text is split
// into two graphemes.
@safe pure unittest
{
    import std.range.primitives : walkLength;
    assert(byGrapheme("\r\u0308").walkLength == 2);
}
7419 
/++
    $(P Lazily transform a range of $(LREF Grapheme)s to a range of code points.)

    $(P Useful for converting the result to a string after doing operations
    on graphemes.)

    $(P If passed in a range of code points, returns a range with equivalent capabilities.)

    Params:
        range = input range of $(LREF Grapheme)s (or of code points)

    Returns:
        An input range of `dchar`; it is also a forward range whenever
        `Range` is one.
+/
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable Grapheme))
{
    // TODO: Propagate bidirectional access
    static struct Result
    {
        private Range _range;
        // index of the current code point inside `_range.front`
        private size_t i = 0;

        @property bool empty()
        {
            return _range.empty;
        }

        @property dchar front()
        {
            return _range.front[i];
        }

        void popFront()
        {
            // still inside the current grapheme?
            if (++i < _range.front.length)
                return;

            // current grapheme exhausted: move on to the next one
            _range.popFront();
            i = 0;
        }

        static if (isForwardRange!Range)
        {
            @property Result save()
            {
                return Result(_range.save, i);
            }
        }
    }

    return Result(range);
}
7469 
/// Ditto
auto byCodePoint(Range)(Range range)
if (isInputRange!Range && is(immutable ElementType!Range == immutable dchar))
{
    import std.range.primitives : isBidirectionalRange, popBack;
    import std.traits : isNarrowString;

    static if (!isNarrowString!Range)
    {
        // Anything that is not a narrow string is returned as is.
        return range;
    }
    else
    {
        // Narrow strings are wrapped in a type that exposes nothing but the
        // bidirectional range primitives.
        static struct Result
        {
            private Range _range;

            @property bool empty() { return _range.empty; }

            @property dchar front() { return _range.front; }
            @property dchar back() { return _range.back; }

            void popFront() { _range.popFront(); }
            void popBack() { _range.popBack(); }

            @property auto save() { return Result(_range.save); }
        }
        static assert(isBidirectionalRange!(Result));
        return Result(range);
    }
}
7494 
///
@safe unittest
{
    import std.array : array;
    import std.conv : text;
    import std.range : retro;

    string s = "noe\u0308l"; // noël

    // reverse it and convert the result to a string
    // (reversing by grapheme keeps the diaeresis attached to its 'e')
    string reverse = s.byGrapheme
        .array
        .retro
        .byCodePoint
        .text;

    assert(reverse == "le\u0308on"); // lëon
}
7513 
// Range capabilities of byCodePoint for the different kinds of input.
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.range.primitives : walkLength;
    import std.range : retro;
    assert("".byGrapheme.byCodePoint.equal(""));

    string text = "noe\u0308l";
    // no `length` on the result for a narrow string
    static assert(!__traits(compiles, "noe\u0308l".byCodePoint.length));

    // input-only graphemes give input-only code points
    auto gText = InputRangeString(text).byGrapheme;
    static assert(!isForwardRange!(typeof(gText)));

    auto cpText = gText.byCodePoint;
    static assert(!isForwardRange!(typeof(cpText)));

    assert(cpText.walkLength == text.walkLength);

    auto plainCp = text.byCodePoint;
    static assert(isForwardRange!(typeof(plainCp)));
    assert(equal(plainCp, text));
    assert(equal(retro(plainCp.save), retro(text.save)));
    // Check that we still have length for dstring
    assert("абвгд"d.byCodePoint.length == 5);
}
7539 
/++
    $(P A structure designed to effectively pack $(CHARACTERS)
    of a $(CLUSTER).
    )

    $(P `Grapheme` has value semantics so 2 copies of a `Grapheme`
    always refer to distinct objects. In most actual scenarios a `Grapheme`
    fits on the stack and avoids memory allocation overhead for all but quite
    long clusters.
    )

    See_Also: $(LREF decodeGrapheme), $(LREF graphemeStride)
+/
@safe struct Grapheme
{
    import std.exception : enforce;
    import std.traits : isDynamicArray;

public:
    /// Ctor
    this(C)(const scope C[] chars...)
        if (is(C : dchar))
    {
        this ~= chars;
    }

    ///ditto
    this(Input)(Input seq)
        if (!isDynamicArray!Input
            && isInputRange!Input && is(ElementType!Input : dchar))
    {
        this ~= seq;
    }

    /// Gets a $(CODEPOINT) at the given index in this cluster.
    dchar opIndex(size_t index) const @nogc nothrow pure @trusted
    {
        assert(index < length);
        // pick the heap buffer or the in-place buffer, whichever is active
        return read24(isBig ? ptr_ : small_.ptr, index);
    }

    /++
        Writes a $(CODEPOINT) `ch` at given index in this cluster.

        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also $(LREF Grapheme.valid).
    +/
    void opIndexAssign(dchar ch, size_t index) @nogc nothrow pure @trusted
    {
        assert(index < length);
        write24(isBig ? ptr_ : small_.ptr, ch, index);
    }

    ///
    @safe unittest
    {
        auto g = Grapheme("A\u0302");
        assert(g[0] == 'A');
        assert(g.valid);
        g[1] = '~'; // ASCII tilde is not a combining mark
        assert(g[1] == '~');
        assert(!g.valid);
    }

    /++
        Random-access range over Grapheme's $(CHARACTERS).

        Warning: Invalidates when this Grapheme leaves the scope,
        attempts to use it then would lead to memory corruption.
    +/
    SliceOverIndexed!Grapheme opSlice(size_t a, size_t b) @nogc nothrow pure return
    {
        return sliceOverIndexed(a, b, &this);
    }

    /// ditto
    SliceOverIndexed!Grapheme opSlice() @nogc nothrow pure return
    {
        return sliceOverIndexed(0, length, &this);
    }

    /// Grapheme cluster length in $(CODEPOINTS).
    @property size_t length() const @nogc nothrow pure
    {
        // 0x7F has the same value as small_mask: it strips the "big" flag
        return isBig ? len_ : slen_ & 0x7F;
    }

    /++
        Append $(CHARACTER) `ch` to this grapheme.
        Warning:
        Use of this facility may invalidate grapheme cluster,
        see also `valid`.

        See_Also: $(LREF Grapheme.valid)
    +/
    ref opOpAssign(string op)(dchar ch) @trusted
    {
        static if (op == "~")
        {
            import std.internal.memory : enforceRealloc;
            if (!isBig)
            {
                if (slen_ == small_cap)
                    convertToBig();// & fallthrough to "big" branch
                else
                {
                    // still room in the in-place buffer
                    write24(small_.ptr, ch, smallLength);
                    slen_++;
                    return this;
                }
            }

            assert(isBig);
            if (len_ == cap_)
            {
                // heap buffer is full: grow it by `grow` elements, with
                // overflow-checked size arithmetic
                import core.checkedint : addu, mulu;
                bool overflow;
                cap_ = addu(cap_, grow, overflow);
                auto nelems = mulu(3, addu(cap_, 1, overflow), overflow);
                if (overflow) assert(0);
                ptr_ = cast(ubyte*) enforceRealloc(ptr_, nelems);
            }
            write24(ptr_, ch, len_++);
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    ///
    @safe unittest
    {
        import std.algorithm.comparison : equal;
        auto g = Grapheme("A");
        assert(g.valid);
        g ~= '\u0301';
        assert(g[].equal("A\u0301"));
        assert(g.valid);
        g ~= "B";
        // not a valid grapheme cluster anymore
        assert(!g.valid);
        // still could be useful though
        assert(g[].equal("A\u0301B"));
    }

    /// Append all $(CHARACTERS) from the input range `inp` to this Grapheme.
    ref opOpAssign(string op, Input)(scope Input inp)
        if (isInputRange!Input && is(ElementType!Input : dchar))
    {
        static if (op == "~")
        {
            foreach (dchar ch; inp)
                this ~= ch;
            return this;
        }
        else
            static assert(false, "No operation "~op~" defined for Grapheme");
    }

    // This is not a good `opEquals`, but formerly the automatically generated
    // opEquals was used, which was inferred `@safe` because of bugzilla 20655:
    // https://issues.dlang.org/show_bug.cgi?id=20655
    // This `@trusted opEquals` is only here to prevent breakage.
    // NOTE(review): this compares the raw fields (including `ptr_`), not the
    // stored code points.
    bool opEquals(R)(const auto ref R other) const @trusted
    {
        return this.tupleof == other.tupleof;
    }

    /++
        True if this object contains valid extended grapheme cluster.
        Decoding primitives of this module always return a valid `Grapheme`.

        Appending to and direct manipulation of grapheme's $(CHARACTERS) may
        render it no longer valid. Certain applications may choose to use
        Grapheme as a "small string" of any $(CODEPOINTS) and ignore this property
        entirely.
    +/
    @property bool valid()() /*const*/
    {
        // Valid means: decoding a single cluster consumes every code point.
        // NOTE(review): the decoder asserts a non-empty input, so calling
        // this on an empty Grapheme trips that assert when asserts are on.
        auto r = this[];
        genericDecodeGrapheme!false(r);
        return r.length == 0;
    }

    // Postblit: a copy in "big" mode gets its own duplicate of the heap
    // buffer, so that copies never share storage.
    this(this) @nogc nothrow pure @trusted
    {
        import std.internal.memory : enforceMalloc;
        if (isBig)
        {// dup it
            import core.checkedint : addu, mulu;
            bool overflow;
            auto raw_cap = mulu(3, addu(cap_, 1, overflow), overflow);
            if (overflow) assert(0);

            auto p = cast(ubyte*) enforceMalloc(raw_cap);
            p[0 .. raw_cap] = ptr_[0 .. raw_cap];
            ptr_ = p;
        }
    }

    // Releases the heap buffer, if one is in use.
    ~this() @nogc nothrow pure @trusted
    {
        import core.memory : pureFree;
        if (isBig)
        {
            pureFree(ptr_);
        }
    }


private:
    // Size of the in-place buffer: the whole struct minus the one byte
    // used for `slen_`.
    enum small_bytes = ((ubyte*).sizeof+3*size_t.sizeof-1);
    // "out of the blue" grow rate, needs testing
    // (though graphemes are typically small < 9)
    enum grow = 20;
    // Number of code points that fit in place (3 bytes per code point).
    enum small_cap = small_bytes/3;
    // `slen_` layout: top bit flags "big" mode, low 7 bits hold the length
    // while in "small" mode.
    enum small_flag = 0x80, small_mask = 0x7F;
    // 16 bytes in 32bits, should be enough for the majority of cases
    union
    {
        // "big" mode: heap buffer, its capacity and length in code points
        struct
        {
            ubyte* ptr_;
            size_t cap_;
            size_t len_;
            size_t padding_;
        }
        // "small" mode: code points stored in place
        struct
        {
            ubyte[small_bytes] small_;
            ubyte slen_;
        }
    }

    // Moves the contents from the in-place buffer to a freshly allocated heap
    // buffer with capacity `grow` and switches to "big" mode.
    // NOTE(review): allocations are sized for capacity + 1 elements here and
    // elsewhere; presumably slack for the 24-bit accessors - confirm against
    // read24/write24.
    void convertToBig() @nogc nothrow pure @trusted
    {
        import std.internal.memory : enforceMalloc;
        static assert(grow.max / 3 - 1 >= grow);
        enum nbytes = 3 * (grow + 1);
        size_t k = smallLength;
        ubyte* p = cast(ubyte*) enforceMalloc(nbytes);
        for (int i=0; i<k; i++)
            write24(p, read24(small_.ptr, i), i);
        // now we can overwrite small array data
        ptr_ = p;
        len_ = slen_;
        assert(grow > len_);
        cap_ = grow;
        setBig();
    }

    // Marks this Grapheme as using the heap buffer.
    void setBig() @nogc nothrow pure { slen_ |= small_flag; }

    // Length in code points while in "small" mode.
    @property size_t smallLength() const @nogc nothrow pure
    {
        return slen_ & small_mask;
    }
    // Non-zero if the heap buffer is in use.
    @property ubyte isBig() const @nogc nothrow pure
    {
        return slen_ & small_flag;
    }
}
7803 
// Layout guard: a Grapheme must occupy exactly four machine words.
static assert(Grapheme.sizeof == size_t.sizeof*4);
7805 
7806 
@safe pure /*nothrow @nogc*/ unittest // TODO: string .front is GC and throw
{
    import std.algorithm.comparison : equal;
    // byGrapheme is expected to yield one Grapheme per letter of the input,
    // comparing equal to Graphemes built from the individual letters.
    Grapheme[3] data = [Grapheme("Ю"), Grapheme("У"), Grapheme("З")];
    assert(byGrapheme("ЮУЗ").equal(data[]));
}
7813 
7814 ///
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : filter;
    import std.range : isRandomAccessRange;

    string bold = "ku\u0308hn";

    // note that decodeGrapheme takes parameter by ref
    // (each call consumes the decoded grapheme from the front of `bold`)
    auto first = decodeGrapheme(bold);

    assert(first.length == 1);
    assert(first[0] == 'k');

    // the next grapheme is 2 characters long
    auto wideOne = decodeGrapheme(bold);
    // slicing a grapheme yields a random-access range of dchar
    assert(wideOne[].equal("u\u0308"));
    assert(wideOne.length == 2);
    static assert(isRandomAccessRange!(typeof(wideOne[])));

    // all of the usual range manipulation is possible
    assert(wideOne[].filter!isMark().equal("\u0308"));

    // appending a combining mark keeps the cluster valid
    auto g = Grapheme("A");
    assert(g.valid);
    g ~= '\u0301';
    assert(g[].equal("A\u0301"));
    assert(g.valid);
    g ~= "B";
    // not a valid grapheme cluster anymore
    assert(!g.valid);
    // still could be useful though
    assert(g[].equal("A\u0301B"));
}
7850 
@safe unittest
{
    // Overwriting a code point in place can invalidate the cluster.
    auto g = Grapheme("A\u0302");
    assert(g[0] == 'A');
    assert(g.valid);
    g[1] = '~'; // ASCII tilde is not a combining mark
    assert(g[1] == '~');
    assert(!g.valid);
}
7860 
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.iteration : map;
    import std.conv : text;
    import std.range : iota;

    // not valid clusters (but it is just a test)
    auto g  = Grapheme('a', 'b', 'c', 'd', 'e');
    assert(g[0] == 'a');
    assert(g[1] == 'b');
    assert(g[2] == 'c');
    assert(g[3] == 'd');
    assert(g[4] == 'e');
    // replacing one element must leave its neighbours untouched
    g[3] = 'Й';
    assert(g[2] == 'c');
    assert(g[3] == 'Й', text(g[3], " vs ", 'Й'));
    assert(g[4] == 'e');
    assert(!g.valid);

    // appending must preserve everything stored so far
    g ~= 'ц';
    g ~= '~';
    assert(g[0] == 'a');
    assert(g[1] == 'b');
    assert(g[2] == 'c');
    assert(g[3] == 'Й');
    assert(g[4] == 'e');
    assert(g[5] == 'ц');
    assert(g[6] == '~');
    assert(!g.valid);

    // a copy must not alias the original's storage
    Grapheme copy = g;
    copy[0] = 'X';
    copy[1] = '-';
    assert(g[0] == 'a' && copy[0] == 'X');
    assert(g[1] == 'b' && copy[1] == '-');
    assert(equal(g[2 .. g.length], copy[2 .. copy.length]));
    copy = Grapheme("АБВГДЕЁЖЗИКЛМ");
    assert(equal(copy[0 .. 8], "АБВГДЕЁЖ"), text(copy[0 .. 8]));
    copy ~= "xyz";
    assert(equal(copy[13 .. 15], "xy"), text(copy[13 .. 15]));
    assert(!copy.valid);

    // build a 26 element grapheme one code point at a time
    Grapheme h;
    foreach (dchar v; iota(cast(int)'A', cast(int)'Z'+1).map!"cast(dchar)a"())
        h ~= v;
    assert(equal(h[], iota(cast(int)'A', cast(int)'Z'+1)));
}
7909 
7910 /++
7911     $(P Does basic case-insensitive comparison of `r1` and `r2`.
7912     This function uses simpler comparison rule thus achieving better performance
7913     than $(LREF icmp). However keep in mind the warning below.)
7914 
7915     Params:
7916         r1 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters
7917         r2 = an $(REF_ALTTEXT input range, isInputRange, std,range,primitives) of characters
7918 
7919     Returns:
7920         An `int` that is 0 if the strings match,
7921         &lt;0 if `r1` is lexicographically "less" than `r2`,
7922         &gt;0 if `r1` is lexicographically "greater" than `r2`
7923 
7924     Warning:
7925     This function only handles 1:1 $(CODEPOINT) mapping
7926     and thus is not sufficient for certain alphabets
7927     like German, Greek and few others.
7928 
7929     See_Also:
7930         $(LREF icmp)
7931         $(REF cmp, std,algorithm,comparison)
7932 +/
7933 int sicmp(S1, S2)(scope S1 r1, scope S2 r2)
7934 if (isInputRange!S1 && isSomeChar!(ElementEncodingType!S1)
7935     && isInputRange!S2 && isSomeChar!(ElementEncodingType!S2))
7936 {
7937     import std.internal.unicode_tables : sTable = simpleCaseTable; // generated file
7938     import std.range.primitives : isInfinite;
7939     import std.utf : decodeFront;
7940     import std.traits : isDynamicArray;
7941     import std.typecons : Yes;
7942     static import std.ascii;
7943 
7944     static if ((isDynamicArray!S1 || isRandomAccessRange!S1)
7945         && (isDynamicArray!S2 || isRandomAccessRange!S2)
7946         && !(isInfinite!S1 && isInfinite!S2)
7947         && __traits(compiles,
7948             {
7949                 size_t s = size_t.sizeof / 2;
7950                 r1 = r1[s .. $];
7951                 r2 = r2[s .. $];
7952             }))
7953     {{
7954         // ASCII optimization for dynamic arrays & similar.
7955         size_t i = 0;
7956         static if (isInfinite!S1)
7957             immutable end = r2.length;
7958         else static if (isInfinite!S2)
7959             immutable end = r1.length;
7960         else
7961             immutable end = r1.length > r2.length ? r2.length : r1.length;
7962         for (; i < end; ++i)
7963         {
7964             auto lhs = r1[i];
7965             auto rhs = r2[i];
7966             if ((lhs | rhs) >= 0x80) goto NonAsciiPath;
7967             if (lhs == rhs) continue;
7968             auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
7969             if (lowDiff) return lowDiff;
7970         }
7971         static if (isInfinite!S1)
7972             return 1;
7973         else static if (isInfinite!S2)
7974             return -1;
7975         else
7976             return (r1.length > r2.length) - (r2.length > r1.length);
7977 
7978     NonAsciiPath:
7979         r1 = r1[i .. $];
7980         r2 = r2[i .. $];
7981         // Fall through to standard case.
7982     }}
7983 
7984     while (!r1.empty)
7985     {
7986         immutable lhs = decodeFront!(Yes.useReplacementDchar)(r1);
7987         if (r2.empty)
7988             return 1;
7989         immutable rhs = decodeFront!(Yes.useReplacementDchar)(r2);
7990         int diff = lhs - rhs;
7991         if (!diff)
7992             continue;
7993         if ((lhs | rhs) < 0x80)
7994         {
7995             immutable d = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
7996             if (!d) continue;
7997             return d;
7998         }
7999         size_t idx = simpleCaseTrie[lhs];
8000         size_t idx2 = simpleCaseTrie[rhs];
8001         // simpleCaseTrie is packed index table
8002         if (idx != EMPTY_CASE_TRIE)
8003         {
8004             if (idx2 != EMPTY_CASE_TRIE)
8005             {// both cased chars
8006                 // adjust idx --> start of bucket
8007                 idx = idx - sTable[idx].n;
8008                 idx2 = idx2 - sTable[idx2].n;
8009                 if (idx == idx2)// one bucket, equivalent chars
8010                     continue;
8011                 else//  not the same bucket
8012                     diff = sTable[idx].ch - sTable[idx2].ch;
8013             }
8014             else
8015                 diff = sTable[idx - sTable[idx].n].ch - rhs;
8016         }
8017         else if (idx2 != EMPTY_CASE_TRIE)
8018         {
8019             diff = lhs - sTable[idx2 - sTable[idx2].n].ch;
8020         }
8021         // one of chars is not cased at all
8022         return diff;
8023     }
8024     return int(r2.empty) - 1;
8025 }
8026 
8027 ///
@safe @nogc pure nothrow unittest
{
    // simple 1:1 case mappings compare as equal
    assert(sicmp("Август", "авгусТ") == 0);
    // Greek also works as long as there is no 1:M mapping in sight
    assert(sicmp("ΌΎ", "όύ") == 0);
    // things like the following won't get matched as equal
    // Greek small letter iota with dialytika and tonos
    assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);

    // while icmp has no problem with that
    assert(icmp("ΐ", "\u03B9\u0308\u0301") == 0);
    assert(icmp("ΌΎ", "όύ") == 0);
}
8041 
// overloads for the most common cases to reduce compile time
// Each one simply forwards to the explicitly instantiated template.
@safe @nogc pure nothrow
{
    int sicmp(scope const(char)[] str1, scope const(char)[] str2)
    { return sicmp!(const(char)[], const(char)[])(str1, str2); }

    int sicmp(scope const(wchar)[] str1, scope const(wchar)[] str2)
    { return sicmp!(const(wchar)[], const(wchar)[])(str1, str2); }

    int sicmp(scope const(dchar)[] str1, scope const(dchar)[] str2)
    { return sicmp!(const(dchar)[], const(dchar)[])(str1, str2); }
}
8054 
/*
    Tries to match `lhs` against `rhs`, optionally followed by a prefix of
    `rtail`, using the full case-folding table.

    Returns 0 on a match; `rtail` is advanced only when a multi-code-point
    folding was matched. On a mismatch returns `lhs` itself when it has no
    entry in the table, otherwise the first code point of its bucket.
*/
private int fullCasedCmp(Range)(dchar lhs, dchar rhs, ref Range rtail)
{
    import std.algorithm.searching : skipOver;
    import std.internal.unicode_tables : fullCaseTable; // generated file

    // fullCaseTrie is packed index table
    immutable size_t hit = fullCaseTrie[lhs];
    if (hit == EMPTY_CASE_TRIE)
        return lhs;

    // [first, limit) spans the bucket of foldings equivalent to lhs
    immutable first = hit - fullCaseTable[hit].n;
    immutable limit = fullCaseTable[hit].size + first;
    assert(fullCaseTable[first].entry_len == 1);

    foreach (size_t pos; first .. limit)
    {
        immutable len = fullCaseTable[pos].entry_len;
        if (len == 1)
        {
            if (fullCaseTable[pos].seq[0] == rhs)
                return 0;
            continue;
        }
        // OK it's a long chunk, like 'ss' for German
        dstring chunk = fullCaseTable[pos].seq[0 .. len];
        // note that this path modifies rtail
        // iff the rest of the chunk was found at its front
        if (rhs == chunk[0] && rtail.skipOver(chunk[1 .. $]))
            return 0;
    }
    return fullCaseTable[first].seq[0]; // new remapped character for accurate diffs
}
8091 
8092 /++
8093     Does case insensitive comparison of `r1` and `r2`.
8094     Follows the rules of full case-folding mapping.
8095     This includes matching as equal german ß with "ss" and
8096     other 1:M $(CODEPOINT) mappings unlike $(LREF sicmp).
8097     The cost of `icmp` being pedantically correct is
8098     slightly worse performance.
8099 
8100     Params:
8101         r1 = a forward range of characters
8102         r2 = a forward range of characters
8103 
8104     Returns:
8105         An `int` that is 0 if the strings match,
8106         &lt;0 if `str1` is lexicographically "less" than `str2`,
8107         &gt;0 if `str1` is lexicographically "greater" than `str2`
8108 
8109     See_Also:
8110         $(LREF sicmp)
8111         $(REF cmp, std,algorithm,comparison)
8112 +/
8113 int icmp(S1, S2)(S1 r1, S2 r2)
8114 if (isForwardRange!S1 && isSomeChar!(ElementEncodingType!S1)
8115     && isForwardRange!S2 && isSomeChar!(ElementEncodingType!S2))
8116 {
8117     import std.range.primitives : isInfinite;
8118     import std.traits : isDynamicArray;
8119     import std.utf : byDchar;
8120     static import std.ascii;
8121 
8122     static if ((isDynamicArray!S1 || isRandomAccessRange!S1)
8123         && (isDynamicArray!S2 || isRandomAccessRange!S2)
8124         && !(isInfinite!S1 && isInfinite!S2)
8125         && __traits(compiles,
8126             {
8127                 size_t s = size_t.max / 2;
8128                 r1 = r1[s .. $];
8129                 r2 = r2[s .. $];
8130             }))
8131     {{
8132         // ASCII optimization for dynamic arrays & similar.
8133         size_t i = 0;
8134         static if (isInfinite!S1)
8135             immutable end = r2.length;
8136         else static if (isInfinite!S2)
8137             immutable end = r1.length;
8138         else
8139             immutable end = r1.length > r2.length ? r2.length : r1.length;
8140         for (; i < end; ++i)
8141         {
8142             auto lhs = r1[i];
8143             auto rhs = r2[i];
8144             if ((lhs | rhs) >= 0x80) goto NonAsciiPath;
8145             if (lhs == rhs) continue;
8146             auto lowDiff = std.ascii.toLower(lhs) - std.ascii.toLower(rhs);
8147             if (lowDiff) return lowDiff;
8148         }
8149         static if (isInfinite!S1)
8150             return 1;
8151         else static if (isInfinite!S2)
8152             return -1;
8153         else
8154             return (r1.length > r2.length) - (r2.length > r1.length);
8155 
8156     NonAsciiPath:
8157         r1 = r1[i .. $];
8158         r2 = r2[i .. $];
8159         // Fall through to standard case.
8160     }}
8161 
8162     auto str1 = r1.byDchar;
8163     auto str2 = r2.byDchar;
8164 
8165     for (;;)
8166     {
8167         if (str1.empty)
8168             return str2.empty ? 0 : -1;
8169         immutable lhs = str1.front;
8170         if (str2.empty)
8171             return 1;
8172         immutable rhs = str2.front;
8173         str1.popFront();
8174         str2.popFront();
8175         if (!(lhs - rhs))
8176             continue;
8177         // first try to match lhs to <rhs,right-tail> sequence
8178         immutable cmpLR = fullCasedCmp(lhs, rhs, str2);
8179         if (!cmpLR)
8180             continue;
8181         // then rhs to <lhs,left-tail> sequence
8182         immutable cmpRL = fullCasedCmp(rhs, lhs, str1);
8183         if (!cmpRL)
8184             continue;
8185         // cmpXX contain remapped codepoints
8186         // to obtain stable ordering of icmp
8187         return cmpLR - cmpRL;
8188     }
8189 }
8190 
8191 ///
@safe @nogc pure nothrow unittest
{
    // full case folding matches 'ß' against "ss"
    assert(icmp("Rußland", "Russland") == 0);
    // multi-code-point foldings may appear on either side of the comparison
    assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
}
8197 
8198 /**
8199  * By using $(REF byUTF, std,utf) and its aliases, GC allocations via auto-decoding
8200  * and thrown exceptions can be avoided, making `icmp` `@safe @nogc nothrow pure`.
8201  */
@safe @nogc nothrow pure unittest
{
    import std.utf : byDchar;

    // same comparisons as in the string example, on ranges of dchar
    assert(icmp("Rußland".byDchar, "Russland".byDchar) == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9".byDchar, "\u1F61\u03B9 -> ᾲ".byDchar) == 0);
}
8209 
// test different character types
@safe unittest
{
    // every mix of char, wchar and dchar strings must compare the same way
    assert(icmp("Rußland", "Russland") == 0);
    assert(icmp("Rußland"w, "Russland") == 0);
    assert(icmp("Rußland", "Russland"w) == 0);
    assert(icmp("Rußland"w, "Russland"w) == 0);
    assert(icmp("Rußland"d, "Russland"w) == 0);
    assert(icmp("Rußland"w, "Russland"d) == 0);
}
8220 
// overloads for the most common cases to reduce compile time
// Each one simply forwards to the explicitly instantiated template.
@safe @nogc pure nothrow
{
    int icmp(const(char)[] str1, const(char)[] str2)
    { return icmp!(const(char)[], const(char)[])(str1, str2); }
    int icmp(const(wchar)[] str1, const(wchar)[] str2)
    { return icmp!(const(wchar)[], const(wchar)[])(str1, str2); }
    int icmp(const(dchar)[] str1, const(dchar)[] str2)
    { return icmp!(const(dchar)[], const(dchar)[])(str1, str2); }
}
8231 
@safe unittest
{
    import std.algorithm.sorting : sort;
    import std.conv : to;
    import std.exception : assertCTFEable;
    // The whole body runs through assertCTFEable, i.e. it must also work in CTFE.
    assertCTFEable!(
    {
    // checks shared by icmp and sicmp, for every pair of string types
    static foreach (cfunc; AliasSeq!(icmp, sicmp))
    {{
        static foreach (S1; AliasSeq!(string, wstring, dstring))
        static foreach (S2; AliasSeq!(string, wstring, dstring))
        {
            assert(cfunc("".to!S1(), "".to!S2()) == 0);
            assert(cfunc("A".to!S1(), "".to!S2()) > 0);
            assert(cfunc("".to!S1(), "0".to!S2()) < 0);
            assert(cfunc("abc".to!S1(), "abc".to!S2()) == 0);
            assert(cfunc("abcd".to!S1(), "abc".to!S2()) > 0);
            assert(cfunc("abc".to!S1(), "abcd".to!S2()) < 0);
            assert(cfunc("Abc".to!S1(), "aBc".to!S2()) == 0);
            assert(cfunc("авГуст".to!S1(), "АВгУСТ".to!S2()) == 0);
            // Check example:
            assert(cfunc("Август".to!S1(), "авгусТ".to!S2()) == 0);
            assert(cfunc("ΌΎ".to!S1(), "όύ".to!S2()) == 0);
        }
        // check that the order is properly agnostic to the case
        auto strs = [ "Apple", "ORANGE",  "orAcle", "amp", "banana"];
        sort!((a,b) => cfunc(a,b) < 0)(strs);
        assert(strs == ["amp", "Apple",  "banana", "orAcle", "ORANGE"]);
    }}
    // icmp only: after 'ß' matches "ss" the comparison continues with 'b' vs 'a'
    assert(icmp("ßb", "ssa") > 0);
    // Check example:
    assert(icmp("Russland", "Rußland") == 0);
    assert(icmp("ᾩ -> \u1F70\u03B9", "\u1F61\u03B9 -> ᾲ") == 0);
    assert(icmp("ΐ"w, "\u03B9\u0308\u0301") == 0);
    assert(sicmp("ΐ", "\u03B9\u0308\u0301") != 0);
    // https://issues.dlang.org/show_bug.cgi?id=11057
    assert( icmp("K", "L") < 0 );
    });
}
8271 
// https://issues.dlang.org/show_bug.cgi?id=17372
@safe pure unittest
{
    import std.algorithm.iteration : joiner, map;
    import std.algorithm.sorting : sort;
    import std.array : array;
    // Regression test: icmp is called on the ranges produced by joiner.
    // No assertion follows; the statement only has to compile and run.
    auto a = [["foo", "bar"], ["baz"]].map!(line => line.joiner(" ")).array.sort!((a, b) => icmp(a, b) < 0);
}
8280 
// This is package(std) for the moment to be used as a support tool for std.regex
// It needs a better API
/*
    Returns a range of all $(CODEPOINTS) that casefold to and from `ch`.
    A code point without an entry in the simple case table yields a range
    containing just `ch` itself.
*/
package(std) auto simpleCaseFoldings(dchar ch) @safe
{
    import std.internal.unicode_tables : simpleCaseTable; // generated file
    alias sTable = simpleCaseTable;

    // Either a single code point (idx == uint.max) or a window of `len`
    // consecutive entries of sTable starting at `idx`.
    static struct Range
    {
    @safe pure nothrow:
        uint idx; //if == uint.max, then read c.
        union
        {
            dchar c; // == 0 - empty range
            uint len;
        }

        @property bool isSmall() const
        {
            return idx == uint.max;
        }

        // single code point form
        this(dchar ch)
        {
            idx = uint.max;
            c = ch;
        }

        // table window form
        this(uint start, uint size)
        {
            idx = start;
            len = size;
        }

        @property dchar front() const
        {
            assert(!empty);
            return isSmall ? c : sTable[idx].ch;
        }

        @property bool empty() const
        {
            return isSmall ? c == 0 : len == 0;
        }

        @property size_t length() const
        {
            if (!isSmall)
                return len;
            return c == 0 ? 0 : 1;
        }

        void popFront()
        {
            if (isSmall)
            {
                c = 0;
                return;
            }
            ++idx;
            --len;
        }
    }

    immutable idx = simpleCaseTrie[ch];
    if (idx != EMPTY_CASE_TRIE)
    {
        auto entry = sTable[idx];
        // entry.n is the offset of this entry inside its bucket
        immutable start = idx - entry.n;
        return Range(start, entry.size);
    }
    return Range(ch);
}
8361 
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.algorithm.searching : canFind;
    import std.array : array;
    import std.exception : assertCTFEable;
    assertCTFEable!((){
        // a cased letter yields both of its case variants
        auto r = simpleCaseFoldings('Э').array;
        assert(r.length == 2);
        assert(r.canFind('э') && r.canFind('Э'));
        // an uncased code point yields just itself
        auto sr = simpleCaseFoldings('~');
        assert(sr.equal("~"));
        //A with ring above - casefolds to the same bucket as Angstrom sign
        sr = simpleCaseFoldings('Å');
        assert(sr.length == 3);
        assert(sr.canFind('å') && sr.canFind('Å') && sr.canFind('\u212B'));
    });
}
8380 
8381 /++
8382     $(P Returns the $(S_LINK Combining class, combining class) of `ch`.)
8383 +/
8384 ubyte combiningClass(dchar ch) @safe pure nothrow @nogc
8385 {
8386     return combiningClassTrie[ch];
8387 }
8388 
8389 ///
@safe unittest
{
    // shorten the code
    alias CC = combiningClass;

    // combining tilde
    assert(CC('\u0303') == 230);
    // combining ring below
    assert(CC('\u0325') == 220);
    // the simple consequence is that  "tilde" should be
    // placed after a "ring below" in a sequence
}
8402 
@safe pure nothrow @nogc unittest
{
    // every ASCII code point has combining class 0
    foreach (ch; 0 .. 0x80)
        assert(combiningClass(ch) == 0);
    // spot checks of non-zero classes
    assert(combiningClass('\u05BD') == 22);
    assert(combiningClass('\u0300') == 230);
    assert(combiningClass('\u0317') == 220);
    assert(combiningClass('\u1939') == 222);
}
8412 
/// Selects the kind of Unicode character decomposition.
enum UnicodeDecomposition
{
    /// Canonical decomposition: the result is a canonically equivalent sequence.
    Canonical,

    /**
        Compatibility decomposition: the result is a compatibility
        equivalent sequence.
        Note: Compatibility decomposition is a $(B lossy) conversion,
        typically suitable only for fuzzy matching and internal processing.
    */
    Compatibility
}
8424 
8425 /**
8426     Shorthand aliases for character decomposition type, passed as a
8427     template parameter to $(LREF decompose).
8428 */
8429 enum {
8430     Canonical = UnicodeDecomposition.Canonical,
8431     Compatibility = UnicodeDecomposition.Compatibility
8432 }
8433 
8434 /++
8435     Try to canonically compose 2 $(CHARACTERS).
8436     Returns the composed $(CHARACTER) if they do compose and dchar.init otherwise.
8437 
8438     The assumption is that `first` comes before `second` in the original text,
8439     usually meaning that the first is a starter.
8440 
8441     Note: Hangul syllables are not covered by this function.
8442     See `composeJamo` below.
8443 +/
8444 public dchar compose(dchar first, dchar second) pure nothrow @safe
8445 {
8446     import std.algorithm.iteration : map;
8447     import std.internal.unicode_comp : compositionTable, composeCntShift, composeIdxMask;
8448     import std.range : assumeSorted;
8449     immutable packed = compositionJumpTrie[first];
8450     if (packed == ushort.max)
8451         return dchar.init;
8452     // unpack offset and length
8453     immutable idx = packed & composeIdxMask, cnt = packed >> composeCntShift;
8454     // TODO: optimize this micro binary search (no more then 4-5 steps)
8455     auto r = compositionTable[idx .. idx+cnt].map!"a.rhs"().assumeSorted();
8456     immutable target = r.lowerBound(second).length;
8457     if (target == cnt)
8458         return dchar.init;
8459     immutable entry = compositionTable[idx+target];
8460     if (entry.rhs != second)
8461         return dchar.init;
8462     return entry.composed;
8463 }
8464 
8465 ///
@safe unittest
{
    // a starter followed by a combining mark composes
    assert(compose('A','\u0308') == '\u00C4');
    // two letters do not compose
    assert(compose('A', 'B') == dchar.init);
    assert(compose('C', '\u0301') == '\u0106');
    // note that the starter is the first one
    // thus the following doesn't compose
    assert(compose('\u0308', 'A') == dchar.init);
}
8475 
8476 /++
8477     Returns a full $(S_LINK Canonical decomposition, Canonical)
8478     (by default) or $(S_LINK Compatibility decomposition, Compatibility)
8479     decomposition of $(CHARACTER) `ch`.
8480     If no decomposition is available returns a $(LREF Grapheme)
8481     with the `ch` itself.
8482 
8483     Note:
8484     This function also decomposes hangul syllables
8485     as prescribed by the standard.
8486 
8487     See_Also: $(LREF decomposeHangul) for a restricted version
8488     that takes into account only hangul syllables  but
8489     no other decompositions.
8490 +/
8491 public Grapheme decompose(UnicodeDecomposition decompType=Canonical)(dchar ch) @safe
8492 {
8493     import std.algorithm.searching : until;
8494     import std.internal.unicode_decomp : decompCompatTable, decompCanonTable;
8495     static if (decompType == Canonical)
8496     {
8497         alias table = decompCanonTable;
8498         alias mapping = canonMappingTrie;
8499     }
8500     else static if (decompType == Compatibility)
8501     {
8502         alias table = decompCompatTable;
8503         alias mapping = compatMappingTrie;
8504     }
8505     immutable idx = mapping[ch];
8506     if (!idx) // not found, check hangul arithmetic decomposition
8507         return decomposeHangul(ch);
8508     auto decomp = table[idx..$].until(0);
8509     return Grapheme(decomp);
8510 }
8511 
8512 ///
@safe unittest
{
    import std.algorithm.comparison : equal;

    assert(compose('A','\u0308') == '\u00C4');
    assert(compose('A', 'B') == dchar.init);
    assert(compose('C', '\u0301') == '\u0106');
    // note that the starter is the first one
    // thus the following doesn't compose
    assert(compose('\u0308', 'A') == dchar.init);

    // canonical decomposition is the default
    assert(decompose('Ĉ')[].equal("C\u0302"));
    // a character without a decomposition is returned as is
    assert(decompose('D')[].equal("D"));
    // hangul syllables are decomposed as well
    assert(decompose('\uD4DC')[].equal("\u1111\u1171\u11B7"));
    assert(decompose!Compatibility('¹')[].equal("1"));
}
8529 
8530 //----------------------------------------------------------------------------
8531 // Hangul specific composition/decomposition
8532 enum jamoSBase = 0xAC00;
8533 enum jamoLBase = 0x1100;
8534 enum jamoVBase = 0x1161;
8535 enum jamoTBase = 0x11A7;
8536 enum jamoLCount = 19, jamoVCount = 21, jamoTCount = 28;
8537 enum jamoNCount = jamoVCount * jamoTCount;
8538 enum jamoSCount = jamoLCount * jamoNCount;
8539 
// Returns true when `ch` lies in the Hangul leading consonant (L) jamo range.
bool isJamoL(dchar ch) pure nothrow @nogc @safe
{
    enum jamoLEnd = jamoLBase + jamoLCount; // one past the last leading jamo
    // the upper bound is tested first: it rejects ~ 1M code points
    // above the leading jamo range
    return ch < jamoLEnd && ch >= jamoLBase;
}
8546 
// Tests if `ch` is a Hangul trailing consonant jamo.
bool isJamoT(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above trailing jamo range
    // Note: ch == jamoTBase doesn't indicate trailing jamo (TIndex must be > 0)
    return ch < jamoTBase+jamoTCount && ch > jamoTBase;
}
8554 
// Tests if `ch` is a Hangul vowel jamo.
bool isJamoV(dchar ch) pure nothrow @nogc @safe
{
    // first cmp rejects ~ 1M code points above vowel range
    return  ch < jamoVBase+jamoVCount && ch >= jamoVBase;
}
8561 
// Returns the zero-based offset of `ch` from jamoSBase when it is below
// jamoSCount, and -1 for any other code point.
int hangulSyllableIndex(dchar ch) pure nothrow @nogc @safe
{
    immutable int idxS = cast(int) ch - jamoSBase;
    if (idxS < 0 || idxS >= jamoSCount)
        return -1;
    return idxS;
}
8567 
// internal helper: composes hangul syllables in place, writing the syllable
// over the leading jamo and leaving dchar.init in the consumed slots
void hangulRecompose(scope dchar[] seq) pure nothrow @nogc @safe
{
    size_t pos = 0;
    while (pos + 1 < seq.length)
    {
        // a syllable always starts with an <L, V> pair
        if (!isJamoL(seq[pos]) || !isJamoV(seq[pos+1]))
        {
            pos++;
            continue;
        }

        immutable int indexL = seq[pos] - jamoLBase;
        immutable int indexV = seq[pos+1] - jamoVBase;
        immutable int indexLV = indexL * jamoNCount + indexV * jamoTCount;
        immutable bool hasTrailing = pos + 2 < seq.length && isJamoT(seq[pos+2]);

        if (hasTrailing)
        {
            // <L, V, T> triple
            seq[pos] = jamoSBase + indexLV + seq[pos+2] - jamoTBase;
            seq[pos+1] = dchar.init;
            seq[pos+2] = dchar.init;
            pos += 3;
        }
        else
        {
            // <L, V> pair
            seq[pos] = jamoSBase + indexLV;
            seq[pos+1] = dchar.init;
            pos += 2;
        }
    }
}
8596 
8597 //----------------------------------------------------------------------------
8598 public:
8599 
8600 /**
8601     Decomposes a Hangul syllable. If `ch` is not a composed syllable
8602     then this function returns $(LREF Grapheme) containing only `ch` as is.
8603 */
8604 Grapheme decomposeHangul(dchar ch) nothrow pure @safe
8605 {
8606     immutable idxS = cast(int) ch - jamoSBase;
8607     if (idxS < 0 || idxS >= jamoSCount) return Grapheme(ch);
8608     immutable idxL = idxS / jamoNCount;
8609     immutable idxV = (idxS % jamoNCount) / jamoTCount;
8610     immutable idxT = idxS % jamoTCount;
8611 
8612     immutable partL = jamoLBase + idxL;
8613     immutable partV = jamoVBase + idxV;
8614     if (idxT > 0) // there is a trailling consonant (T); <L,V,T> decomposition
8615         return Grapheme(partL, partV, jamoTBase + idxT);
8616     else // <L, V> decomposition
8617         return Grapheme(partL, partV);
8618 }
8619 
8620 ///
@safe unittest
{
    import std.algorithm.comparison : equal;
    // an LVT syllable decomposes into three jamos
    assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
}
8626 
8627 /++
8628     Try to compose hangul syllable out of a leading consonant (`lead`),
8629     a `vowel` and optional `trailing` consonant jamos.
8630 
8631     On success returns the composed LV or LVT hangul syllable.
8632 
8633     If any of `lead` and `vowel` are not a valid hangul jamo
8634     of the respective $(CHARACTER) class returns dchar.init.
8635 +/
8636 dchar composeJamo(dchar lead, dchar vowel, dchar trailing=dchar.init) pure nothrow @nogc @safe
8637 {
8638     if (!isJamoL(lead))
8639         return dchar.init;
8640     immutable indexL = lead - jamoLBase;
8641     if (!isJamoV(vowel))
8642         return dchar.init;
8643     immutable indexV = vowel - jamoVBase;
8644     immutable indexLV = indexL * jamoNCount + indexV * jamoTCount;
8645     immutable dchar syllable = jamoSBase + indexLV;
8646     return isJamoT(trailing) ? syllable + (trailing - jamoTBase) : syllable;
8647 }
8648 
8649 ///
@safe unittest
{
    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
    // leaving out the trailing consonant (T), or passing any codepoint
    // that is not trailing consonant composes an LV-syllable
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC');
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
    // an invalid vowel or leading consonant yields dchar.init
    assert(composeJamo('\u1111', 'A') == dchar.init);
    assert(composeJamo('A', '\u1171') == dchar.init);
}
8660 
@safe unittest
{
    import std.algorithm.comparison : equal;
    import std.conv : text;

    // decomposes `ch` using decomposition type T and compares against `r`
    static void testDecomp(UnicodeDecomposition T)(dchar ch, string r)
    {
        Grapheme g = decompose!T(ch);
        assert(equal(g[], r), text(g[], " vs ", r));
    }
    testDecomp!Canonical('\u1FF4', "\u03C9\u0301\u0345");
    testDecomp!Canonical('\uF907', "\u9F9C");
    testDecomp!Compatibility('\u33FF', "\u0067\u0061\u006C");
    testDecomp!Compatibility('\uA7F9', "\u0153");

    // check examples
    assert(decomposeHangul('\uD4DB')[].equal("\u1111\u1171\u11B6"));
    assert(composeJamo('\u1111', '\u1171', '\u11B6') == '\uD4DB');
    assert(composeJamo('\u1111', '\u1171') == '\uD4CC'); // leave out the trailing consonant (T)
    assert(composeJamo('\u1111', '\u1171', ' ') == '\uD4CC');
    assert(composeJamo('\u1111', 'A') == dchar.init);
    assert(composeJamo('A', '\u1171') == dchar.init);
}
8684 
8685 /**
8686     Enumeration type for normalization forms,
8687     passed as template parameter for functions like $(LREF normalize).
8688 */
enum NormalizationForm {
    /// Canonical decomposition followed by recomposition.
    NFC,
    /// Canonical decomposition only.
    NFD,
    /// Compatibility decomposition followed by recomposition.
    NFKC,
    /// Compatibility decomposition only.
    NFKD
}
8695 
8696 
enum {
    /**
        Shorthand aliases for the values of $(LREF NormalizationForm).
    */
    NFC = NormalizationForm.NFC,
    ///ditto
    NFD = NormalizationForm.NFD,
    ///ditto
    NFKC = NormalizationForm.NFKC,
    ///ditto
    NFKD = NormalizationForm.NFKD
}
8709 
8710 /++
8711     Returns `input` string normalized to the chosen form.
8712     Form C is used by default.
8713 
8714     For more information on normalization forms see
8715     the $(S_LINK Normalization, normalization section).
8716 
8717     Note:
8718     In cases where the string in question is already normalized,
8719     it is returned unmodified and no memory allocation happens.
8720 +/
8721 /*
8722     WARNING: @trusted lambda inside - handle with same care as @trusted
8723         functions
8724 
8725     Despite being a template, the attributes do no harm since this doesn't work
8726     with user-defined range or character types anyway.
8727 */
pure @safe inout(C)[] normalize(NormalizationForm norm=NFC, C)
    (return scope inout(C)[] input)
{
    import std.algorithm.mutation : SwapStrategy;
    import std.algorithm.sorting : sort;
    import std.array : appender;
    import std.range : zip;

    // anchors[0] .. anchors[1] delimits the first piece of `input` that needs
    // work; both indexes equal to input.length means nothing has to be done
    auto anchors = splitNormalized!norm(input);
    if (anchors[0] == input.length && anchors[1] == input.length)
        return input;
    // scratch buffers reused by every iteration of the loop below:
    // the decomposed code points and, index-aligned, their combining classes
    dchar[] decomposed;
    decomposed.reserve(31);
    ubyte[] ccc;
    ccc.reserve(31);
    auto app = appender!(C[])();
    do
    {
        // text in front of the region is copied through untouched
        app.put(input[0 .. anchors[0]]);
        // decompose every code point of the region, canonically or
        // by compatibility depending on the requested form
        foreach (dchar ch; input[anchors[0]..anchors[1]])
            static if (norm == NFD || norm == NFC)
            {
                foreach (dchar c; decompose!Canonical(ch)[])
                    decomposed ~= c;
            }
            else // NFKD & NFKC
            {
                foreach (dchar c; decompose!Compatibility(ch)[])
                    decomposed ~= c;
            }
        ccc.length = decomposed.length;
        // start of the current run of code points with non-zero combining class
        size_t firstNonStable = 0;
        ubyte lastClazz = 0;

        // reorder: stable-sort each run of non-zero combining classes
        // by class, keeping code points and classes in lock-step via zip
        foreach (idx, dchar ch; decomposed)
        {
            immutable clazz = combiningClass(ch);
            ccc[idx] = clazz;
            if (clazz == 0 && lastClazz != 0)
            {
                // found a stable code point after unstable ones
                sort!("a[0] < b[0]", SwapStrategy.stable)
                    (zip(ccc[firstNonStable .. idx], decomposed[firstNonStable .. idx]));
                // park the marker at the end so the trailing sort below is
                // a no-op unless another unstable run starts
                firstNonStable = decomposed.length;
            }
            else if (clazz != 0 && lastClazz == 0)
            {
                // found first unstable code point after stable ones
                firstNonStable = idx;
            }
            lastClazz = clazz;
        }
        // sort the run that reaches the end of the buffer (if any)
        sort!("a[0] < b[0]", SwapStrategy.stable)
            (zip(ccc[firstNonStable..$], decomposed[firstNonStable..$]));
        static if (norm == NFC || norm == NFKC)
        {
            import std.algorithm.searching : countUntil;
            // index of the first code point with combining class 0, -1 if none
            auto first = countUntil(ccc, 0);
            if (first >= 0) // no starters?? no recomposition
            {
                // recompose returns the index where it stopped; keep going
                // from there until the end of the buffer is reached
                for (;;)
                {
                    immutable second = recompose(first, decomposed, ccc);
                    if (second == decomposed.length)
                        break;
                    first = second;
                }
                // 2nd pass for hangul syllables
                hangulRecompose(decomposed);
            }
        }
        static if (norm == NFD || norm == NFKD)
            app.put(decomposed);
        else
        {
            import std.algorithm.mutation : remove;
            // recompose marks merged code points with dchar.init; drop those
            auto clean = remove!("a == dchar.init", SwapStrategy.stable)(decomposed);
            app.put(decomposed[0 .. clean.length]);
        }
        // reset variables
        decomposed.length = 0;
        () @trusted {
            // assumeSafeAppend isn't considered pure as of writing, hence the
            // cast. It isn't pure in the sense that the elements after
            // the array in question are affected, but we don't use those
            // making the call pure for our purposes.
            (cast(void delegate() pure nothrow) {decomposed.assumeSafeAppend();})();
            ccc.length = 0;
            (cast(void delegate() pure nothrow) {ccc.assumeSafeAppend();})();
        } ();
        // drop everything processed so far; anchors are relative to the rest
        input = input[anchors[1]..$];
        // and move on
        anchors = splitNormalized!norm(input);
    } while (anchors[0] != input.length);
    // the remaining tail needs no normalization
    app.put(input[0 .. anchors[0]]);
    // the appender holds mutable C[]; cast back to the qualifier of the input
    return () @trusted inout { return cast(inout(C)[]) app.data; } ();
}
8825 
8826 ///
@safe pure unittest
{
    // any encoding works
    wstring greet = "Hello world";
    // already normalized input comes back without copying
    assert(normalize(greet) is greet); // the same exact slice

    // An example of a character with all 4 forms being different:
    // Greek upsilon with acute and hook symbol (code point 0x03D3)
    assert(normalize!NFC("ϓ") == "\u03D3");
    assert(normalize!NFD("ϓ") == "\u03D2\u0301");
    assert(normalize!NFKC("ϓ") == "\u038E");
    assert(normalize!NFKD("ϓ") == "\u03A5\u0301");
}
8840 
@safe pure unittest
{
    import std.conv : text;

    // a region needing normalization surrounded by plain ASCII
    assert(normalize!NFD("abc\uF904def") == "abc\u6ED1def", text(normalize!NFD("abc\uF904def")));
    // compatibility decomposition maps superscript digits to plain digits
    assert(normalize!NFKD("2¹⁰") == "210", normalize!NFKD("2¹⁰"));
    assert(normalize!NFD("Äffin") == "A\u0308ffin");

    // test with dstring
    dstring greet = "Hello world";
    assert(normalize(greet) is greet); // the same exact slice
}
8853 
8854 // canonically recompose given slice of code points, works in-place and mutates data
private size_t recompose(size_t start, scope dchar[] input, scope ubyte[] ccc) pure nothrow @safe
{
    assert(input.length == ccc.length);
    // Highest combining class seen so far between the starter and the
    // current position; -1 keeps it below the whole 0 .. 255 range.
    int blockingCC = -1;
    // input[start] is the starter S, candidates begin right after it
    size_t pos = start + 1;
    while (pos != input.length)
    {
        immutable candidateCC = ccc[pos];
        // In any character sequence beginning with a starter S
        // a character C is blocked from S if and only if there
        // is some character B between S and C, and either B
        // is a starter or it has the same or higher combining class as C.
        // Here S is input[start], C is input[pos] and, because combining
        // classes are sorted, blockingCC is the maximum class between them.
        if (candidateCC > blockingCC)
        {
            immutable merged = compose(input[start], input[pos]);
            if (merged != dchar.init)
            {
                input[start] = merged;
                // leave a sentinel in place of the consumed code point; its
                // class must not block the following candidates
                input[pos] = dchar.init;
                pos++;
                continue;
            }
        }
        // C stays where it is (blocked, or no composition exists):
        // it now takes part in blocking; a starter ends the scan
        blockingCC = candidateCC;
        if (blockingCC == 0)
            break;
        pos++;
    }
    return pos;
}
8907 
8908 // returns tuple of 2 indexes that delimit:
8909 // normalized text, piece that needs normalization and
8910 // the rest of input starting with stable code point
private auto splitNormalized(NormalizationForm norm, C)(scope const(C)[] input)
{
    import std.typecons : tuple;
    // combining class of the previously inspected code point
    ubyte prevCC = 0;

    foreach (pos, dchar cp; input)
    {
        static if (norm == NFC)
        {
            // fast path for NFC: code points below U+0300 are skipped
            // without consulting any table
            if (cp < 0x0300)
            {
                prevCC = 0;
                continue;
            }
        }
        immutable ubyte curCC = combiningClass(cp);
        // a non-starter following one with a higher combining class
        // means the combining marks are out of order
        immutable outOfOrder = curCC != 0 && prevCC > curCC;
        if (outOfOrder || notAllowedIn!norm(cp))
            return seekStable!norm(pos, input);
        prevCC = curCC;
    }
    // nothing to fix: both indexes point at the end of input
    return tuple(input.length, input.length);
}
8938 
/*
    Given the index `idx` of a code point that violates normalization form
    `norm`, widens it to the region that has to be re-normalized.

    Returns: tuple(region_start, region_end) where region_start is the index
    of the closest code point before `idx` that has combining class 0 and is
    allowed in `norm` (0 if there is none), and region_end is the index of the
    first such code point at or after `idx` (input.length if there is none).
*/
private auto seekStable(NormalizationForm norm, C)(size_t idx, const scope C[] input)
{
    import std.range.primitives : popBack;
    import std.typecons : tuple;
    import std.utf : codeLength;

    auto br = input[0 .. idx];
    size_t region_start = 0;// default
    // scan backwards from idx, one code point at a time
    for (;;)
    {
        if (br.empty)// start is 0
            break;
        dchar ch = br.back;
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            // ch is the last code point of br, so it starts this many
            // code units from the beginning of input
            region_start = br.length - codeLength!C(ch);
            break;
        }
        // Step to the previous code point. This has to shrink the slice from
        // the back: popping the front would keep re-testing the same `back`
        // until the slice is empty and always fall back to region_start == 0.
        br.popBack();
    }
    ///@@@BUG@@@ can't use find: " find is a nested function and can't be used..."
    size_t region_end=input.length;// end is $ by default
    // scan forwards from idx for the first stable code point
    foreach (i, dchar ch; input[idx..$])
    {
        if (combiningClass(ch) == 0 && allowedIn!norm(ch))
        {
            region_end = i+idx;
            break;
        }
    }
    // writeln("Region to normalize: ", input[region_start .. region_end]);
    return tuple(region_start, region_end);
}
8971 
8972 /**
8973     Tests if dchar `ch` is always allowed (Quick_Check=YES) in normalization
8974     form `norm`.
8975 */
public bool allowedIn(NormalizationForm norm)(dchar ch)
{
    // simply the negation of the private quick-check lookup
    immutable rejected = notAllowedIn!norm(ch);
    return !rejected;
}
8980 
8981 ///
@safe unittest
{
    // e.g. Cyrillic is always allowed, so is ASCII
    // (checked here for all four normalization forms)
    assert(allowedIn!NFC('я'));
    assert(allowedIn!NFD('я'));
    assert(allowedIn!NFKC('я'));
    assert(allowedIn!NFKD('я'));
    assert(allowedIn!NFC('Z'));
}
8991 
8992 // not user friendly name but more direct
// Looks `ch` up in the quick-check trie of normalization form `norm`;
// true means the code point is not unconditionally allowed in that form.
private bool notAllowedIn(NormalizationForm norm)(dchar ch)
{
    // pick the trie matching the requested form at compile time
    static if (norm == NFC)
        alias qcTrie = nfcQCTrie;
    else static if (norm == NFD)
        alias qcTrie = nfdQCTrie;
    else static if (norm == NFKC)
        alias qcTrie = nfkcQCTrie;
    else static if (norm == NFKD)
        alias qcTrie = nfkdQCTrie;
    else
        // The condition must be a constant false: a string used as the
        // condition is always true and would never trigger the error.
        static assert(0, "Unknown normalization form");
    return qcTrie[ch];
}
9007 
// same checks as the documented allowedIn example
@safe unittest
{
    assert(allowedIn!NFC('я'));
    assert(allowedIn!NFD('я'));
    assert(allowedIn!NFKC('я'));
    assert(allowedIn!NFKD('я'));
    assert(allowedIn!NFC('Z'));
}
9016 
9017 }
9018 
9019 version (std_uni_bootstrap)
9020 {
9021     // old version used for bootstrapping of gen_uni.d that generates
9022     // up to date optimal versions of all of isXXX functions
9023     @safe pure nothrow @nogc public bool isWhite(dchar c)
9024     {
9025         import std.ascii : isWhite;
9026         return isWhite(c) ||
9027                c == lineSep || c == paraSep ||
9028                c == '\u0085' || c == '\u00A0' || c == '\u1680' || c == '\u180E' ||
9029                (c >= '\u2000' && c <= '\u200A') ||
9030                c == '\u202F' || c == '\u205F' || c == '\u3000';
9031     }
9032 }
9033 else
9034 {
9035 
9036 // trusted -> avoid bounds check
@trusted pure nothrow @nogc private
{
    import std.internal.unicode_tables; // : toLowerTable, toTitleTable, toUpperTable; // generated file

    // hide template instances behind functions
    // https://issues.dlang.org/show_bug.cgi?id=13232

    // Each group below has the same shape: two trie lookups keyed by code
    // point that yield a ushort index, and an accessor for the table that
    // index refers to.

    // lower case
    ushort toLowerIndex(dchar c) { return toLowerIndexTrie[c]; }
    ushort toLowerSimpleIndex(dchar c) { return toLowerSimpleIndexTrie[c]; }
    dchar toLowerTab(size_t idx) { return toLowerTable[idx]; }

    // title case
    ushort toTitleIndex(dchar c) { return toTitleIndexTrie[c]; }
    ushort toTitleSimpleIndex(dchar c) { return toTitleSimpleIndexTrie[c]; }
    dchar toTitleTab(size_t idx) { return toTitleTable[idx]; }

    // upper case
    ushort toUpperIndex(dchar c) { return toUpperIndexTrie[c]; }
    ushort toUpperSimpleIndex(dchar c) { return toUpperSimpleIndexTrie[c]; }
    dchar toUpperTab(size_t idx) { return toUpperTable[idx]; }
}
9055 
9056 public:
9057 
9058 /++
9059     Whether or not `c` is a Unicode whitespace $(CHARACTER).
9060     (general Unicode category: Part of C0(tab, vertical tab, form feed,
9061     carriage return, and linefeed characters), Zs, Zl, Zp, and NEL(U+0085))
9062 +/
@safe pure nothrow @nogc
public bool isWhite(dchar c)
{
    // the actual test is a binary search generated into unicode_tables
    import std.internal.unicode_tables : isWhiteGen;

    immutable white = isWhiteGen(c);
    return white;
}
9069 
9070 /++
9071     Return whether `c` is a Unicode lowercase $(CHARACTER).
9072 +/
@safe pure nothrow @nogc
bool isLower(dchar c)
{
    import std.ascii : isLower, isASCII;

    // ASCII is answered by std.ascii, everything else by the trie
    return isASCII(c) ? isLower(c) : lowerCaseTrie[c];
}
9081 
@safe unittest
{
    import std.ascii : isLower;
    // inside this block the plain name is std.ascii.isLower,
    // `.isLower` is the one from this module; they must agree on ASCII
    foreach (v; 0 .. 0x80)
        assert(isLower(v) == .isLower(v));
    assert(.isLower('я'));
    assert(.isLower('й'));
    assert(!.isLower('Ж'));
    // Greek HETA
    assert(!.isLower('\u0370'));
    assert(.isLower('\u0371'));
    assert(!.isLower('\u039C')); // capital MU
    assert(.isLower('\u03B2')); // beta
    // from extended Greek
    assert(!.isLower('\u1F18'));
    assert(.isLower('\u1F00'));
    // every member of the lowerCase set is lower and not upper
    foreach (v; unicode.lowerCase.byCodepoint)
        assert(.isLower(v) && !isUpper(v));
}
9101 
9102 
9103 /++
9104     Return whether `c` is a Unicode uppercase $(CHARACTER).
9105 +/
@safe pure nothrow @nogc
bool isUpper(dchar c)
{
    import std.ascii : isUpper, isASCII;

    // ASCII is answered by std.ascii, everything else by the trie
    return isASCII(c) ? isUpper(c) : upperCaseTrie[c];
}
9114 
@safe unittest
{
    static import std.ascii;
    // this module's isUpper must agree with std.ascii on the ASCII range
    // (the check used to compare isLower, leaving isUpper untested here)
    foreach (v; 0 .. 0x80)
        assert(std.ascii.isUpper(v) == .isUpper(v));
    assert(!isUpper('й'));
    assert(isUpper('Ж'));
    // Greek HETA
    assert(isUpper('\u0370'));
    assert(!isUpper('\u0371'));
    assert(isUpper('\u039C')); // capital MU
    assert(!isUpper('\u03B2')); // beta
    // from extended Greek
    assert(!isUpper('\u1F00'));
    assert(isUpper('\u1F18'));
    // every member of the upperCase set is upper and not lower
    foreach (v; unicode.upperCase.byCodepoint)
        assert(isUpper(v) && !.isLower(v));
}
9133 
9134 
9135 //TODO: Hidden for now, needs better API.
9136 //Other transforms could use better API as well, but this one is a new primitive.
@safe pure nothrow @nogc
private dchar toTitlecase(dchar c)
{
    // Fast path without table lookups: below U+00AA only 'a' .. 'z' change.
    if (c < 0xAA)
    {
        if (c >= 'a' && c <= 'z')
            return c - 32;
        return c;
    }
    // ushort.max marks "no simple title case mapping"
    immutable size_t tableIdx = toTitleSimpleIndex(c);
    return tableIdx == ushort.max ? c : toTitleTab(tableIdx);
}
9156 
// Argument bundles spliced into the case-conversion templates: index lookup
// function, the index bound below which a table entry is a single code point
// (see the `idx < maxIdx` tests in toCaser), and the table accessor.
private alias UpperTriple = AliasSeq!(toUpperIndex, MAX_SIMPLE_UPPER, toUpperTab);
private alias LowerTriple = AliasSeq!(toLowerIndex, MAX_SIMPLE_LOWER, toLowerTab);
9159 
9160 // generic toUpper/toLower on whole string, creates new or returns as is
// generic toUpper/toLower on whole string, creates new or returns as is
//
// indexFn maps a code point to a table index (ushort.max = no mapping),
// tableFn reads the table, indexes below maxIdx are single code point
// mappings, asciiConvert handles ASCII code points directly.
private ElementEncodingType!S[] toCase(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, S)(S s)
if (isSomeString!S || (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    import std.array : appender, array;
    import std.ascii : isASCII;
    import std.utf : byDchar, codeLength;

    alias C = ElementEncodingType!S;

    // Scan for the first code point that has a mapping; `i` tracks the
    // code unit offset of r.front inside `s`.
    auto r = s.byDchar;
    for (size_t i; !r.empty; i += r.front.codeLength!C , r.popFront())
    {
        auto cOuter = r.front;
        ushort idx = indexFn(cOuter);
        if (idx == ushort.max)
            continue;
        // Found one: copy the untouched prefix and convert the rest
        // in a single pass, then return from inside the loop.
        auto result = appender!(C[])();
        result.reserve(s.length);
        result.put(s[0 .. i]);
        foreach (dchar c; s[i .. $].byDchar)
        {
            if (c.isASCII)
            {
                result.put(asciiConvert(c));
            }
            else
            {
                idx = indexFn(c);
                if (idx == ushort.max)
                    result.put(c); // no mapping, keep as is
                else if (idx < maxIdx)
                {
                    // one-to-one mapping
                    c = tableFn(idx);
                    result.put(c);
                }
                else
                {
                    // one-to-many mapping: the first entry packs the sequence
                    // length in the top 8 bits and the first code point in the
                    // low 24 bits, the remaining code points follow in the table
                    auto val = tableFn(idx);
                    // unpack length + codepoint
                    immutable uint len = val >> 24;
                    result.put(cast(dchar)(val & 0xFF_FFFF));
                    foreach (j; idx+1 .. idx+len)
                        result.put(tableFn(j));
                }
            }
        }
        return result.data;
    }

    // no code point had a mapping: strings are returned as is,
    // other ranges are copied into a new array
    static if (isSomeString!S)
        return s;
    else
        return s.array;
}
9215 
9216 // https://issues.dlang.org/show_bug.cgi?id=12428
@safe unittest
{
    import std.array : replicate;
    // a short slice at the front of a much larger allocation
    auto s = "abcdefghij".replicate(300);
    s = s[0 .. 10];

    toUpper(s);

    // the call above must leave the source slice untouched
    assert(s == "abcdefghij");
}
9227 
9228 // https://issues.dlang.org/show_bug.cgi?id=18993
@safe unittest
{
    // evaluated at compile time: both inputs must lower-case to the same length
    static assert(`몬스터/A`.toLower.length == `몬스터/a`.toLower.length);
}
9233 
9234 
9235 // generic toUpper/toLower on whole range, returns range
// generic toUpper/toLower on whole range, returns range
//
// The returned range yields the converted code points lazily; one source
// code point may expand to several, which are staged in a small buffer.
private auto toCaser(alias indexFn, uint maxIdx, alias tableFn, alias asciiConvert, Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCaserImpl
    {
        // empty once the buffer is drained and the source is exhausted
        @property bool empty()
        {
            return !nLeft && r.empty;
        }

        @property auto front()
        {
            import std.ascii : isASCII;

            // nLeft == 0: the buffer is empty, convert the next source
            // code point and stage the result in buf
            if (!nLeft)
            {
                dchar c = r.front;
                if (c.isASCII)
                {
                    buf[0] = asciiConvert(c);
                    nLeft = 1;
                }
                else
                {
                    const idx = indexFn(c);
                    if (idx == ushort.max)
                    {
                        // no mapping, pass through
                        buf[0] = c;
                        nLeft = 1;
                    }
                    else if (idx < maxIdx)
                    {
                        // one-to-one mapping
                        buf[0] = tableFn(idx);
                        nLeft = 1;
                    }
                    else
                    {
                        // one-to-many mapping: top 8 bits of the first entry
                        // hold the length, low 24 bits the first code point
                        immutable val = tableFn(idx);
                        // unpack length + codepoint
                        nLeft = val >> 24;
                        if (nLeft == 0)
                            nLeft = 1;
                        assert(nLeft <= buf.length);
                        // buf is filled back to front so that buf[nLeft - 1]
                        // is always the next code point to hand out
                        buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF);
                        foreach (j; 1 .. nLeft)
                            buf[nLeft - j - 1] = tableFn(idx + j);
                    }
                }
            }
            return buf[nLeft - 1];
        }

        void popFront()
        {
            // make sure the buffer is filled even if front was never called
            if (!nLeft)
                front;
            assert(nLeft);
            --nLeft;
            // advance the source only after all staged code points are consumed
            if (!nLeft)
                r.popFront();
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto ret = this;
                ret.r = r.save;
                return ret;
            }
        }

      private:
        Range r;            // source range of code points
        uint nLeft;         // number of staged code points still in buf
        dchar[3] buf = void; // staging area, written before it is read
    }

    return ToCaserImpl(str);
}
9319 
9320 /*********************
9321  * Convert an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
9322  * or a string to upper or lower case.
9323  *
9324  * Does not allocate memory.
9325  * Characters in UTF-8 or UTF-16 format that cannot be decoded
9326  * are treated as $(REF replacementDchar, std,utf).
9327  *
9328  * Params:
9329  *      str = string or range of characters
9330  *
9331  * Returns:
9332  *      an input range of `dchar`s
9333  *
9334  * See_Also:
9335  *      $(LREF toUpper), $(LREF toLower)
9336  */
9337 
auto asLowerCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof == dchar.sizeof)
    {
        // already a range of code points: convert directly
        static import std.ascii;
        return toCaser!(LowerTriple, std.ascii.toLower)(str);
    }
    else
    {
        // UTF-8 / UTF-16 code units: decode to code points first
        import std.utf : byDchar;

        return asLowerCase(str.byDchar);
    }
}
9355 
9356 /// ditto
auto asUpperCase(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof == dchar.sizeof)
    {
        // already a range of code points: convert directly
        static import std.ascii;
        return toCaser!(UpperTriple, std.ascii.toUpper)(str);
    }
    else
    {
        // UTF-8 / UTF-16 code units: decode to code points first
        import std.utf : byDchar;

        return asUpperCase(str.byDchar);
    }
}
9374 
9375 ///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // the result is a lazy range, so compare it element by element
    assert("hEllo".asUpperCase.equal("HELLO"));
}
9382 
9383 // explicitly undocumented
auto asLowerCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;

    // forward to the range overload, instantiated with the string type
    // that Range converts to
    alias Str = StringTypeOf!Range;
    return asLowerCase!Str(str);
}
9390 
9391 // explicitly undocumented
auto asUpperCase(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;

    // forward to the range overload, instantiated with the string type
    // that Range converts to
    alias Str = StringTypeOf!Range;
    return asUpperCase!Str(str);
}
9398 
@safe unittest
{
    // non-copyable wrapper that converts to string through alias this
    static struct TestAliasedString
    {
        string get() @safe @nogc pure nothrow { return _s; }
        alias get this;
        @disable this(this);
        string _s;
    }

    // calls `func` once with the wrapper and once with the plain string
    // and checks that both calls give the same result
    static bool testAliasedString(alias func, Args...)(string s, Args args)
    {
        import std.algorithm.comparison : equal;
        auto a = func(TestAliasedString(s), args);
        auto b = func(s, args);
        static if (is(typeof(equal(a, b))))
        {
            // For ranges, compare contents instead of object identity.
            return equal(a, b);
        }
        else
        {
            return a == b;
        }
    }
    assert(testAliasedString!asLowerCase("hEllo"));
    assert(testAliasedString!asUpperCase("hEllo"));
    assert(testAliasedString!asCapitalized("hEllo"));
}
9428 
@safe unittest
{
    import std.array : array;

    // a saved copy must produce the same sequence as the original
    auto a = "HELLo".asLowerCase;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "hello");
    s = savea.array;
    assert(s == "hello");

    string[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
    string[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

    // the lazy ranges must agree with the eager toUpper/toLower
    foreach (i, slwr; lower)
    {
        import std.utf : byChar;

        auto sx = slwr.asUpperCase.byChar.array;
        assert(sx == toUpper(slwr));
        auto sy = upper[i].asLowerCase.byChar.array;
        assert(sy == toLower(upper[i]));
    }

    // Not necessary to call r.front
    for (auto r = lower[3].asUpperCase; !r.empty; r.popFront())
    {
    }

    import std.algorithm.comparison : equal;

    // wstring and dstring inputs; the results have to be asserted,
    // a bare call to equal checks nothing
    assert("HELLo"w.asLowerCase.equal("hello"d));
    assert("HELLo"w.asUpperCase.equal("HELLO"d));
    assert("HELLo"d.asLowerCase.equal("hello"d));
    assert("HELLo"d.asUpperCase.equal("HELLO"d));

    import std.utf : byChar;
    assert(toLower("\u1Fe2") == asLowerCase("\u1Fe2").byChar.array);
}
9468 
9469 // generic capitalizer on whole range, returns range
// generic capitalizer on whole range, returns range
//
// The first source code point is upper-cased through the given triple
// (and may expand to several code points); once it has been consumed the
// rest of the source is served through asLowerCase.
private auto toCapitalizer(alias indexFnUpper, uint maxIdxUpper, alias tableFnUpper,
                           Range)(Range str)
    // Accept range of dchar's
if (isInputRange!Range &&
    isSomeChar!(ElementEncodingType!Range) &&
    ElementEncodingType!Range.sizeof == dchar.sizeof)
{
    static struct ToCapitalizerImpl
    {
        @property bool empty()
        {
            // after the first character everything is delegated to lwr
            return lower ? lwr.empty : !nLeft && r.empty;
        }

        @property auto front()
        {
            if (lower)
                return lwr.front;

            // nLeft == 0: the first code point has not been converted yet
            if (!nLeft)
            {
                immutable dchar c = r.front;
                const idx = indexFnUpper(c);
                if (idx == ushort.max)
                {
                    // no mapping, pass through
                    buf[0] = c;
                    nLeft = 1;
                }
                else if (idx < maxIdxUpper)
                {
                    // one-to-one mapping
                    buf[0] = tableFnUpper(idx);
                    nLeft = 1;
                }
                else
                {
                    // one-to-many mapping: top 8 bits of the first entry
                    // hold the length, low 24 bits the first code point
                    immutable val = tableFnUpper(idx);
                    // unpack length + codepoint
                    nLeft = val >> 24;
                    if (nLeft == 0)
                        nLeft = 1;
                    assert(nLeft <= buf.length);
                    // buf is filled back to front so that buf[nLeft - 1]
                    // is always the next code point to hand out
                    buf[nLeft - 1] = cast(dchar)(val & 0xFF_FFFF);
                    foreach (j; 1 .. nLeft)
                        buf[nLeft - j - 1] = tableFnUpper(idx + j);
                }
            }
            return buf[nLeft - 1];
        }

        void popFront()
        {
            if (lower)
                lwr.popFront();
            else
            {
                // make sure the buffer is filled even if front was never called
                if (!nLeft)
                    front;
                assert(nLeft);
                --nLeft;
                if (!nLeft)
                {
                    // first character fully emitted: switch to lower-casing
                    // whatever is left of the source
                    r.popFront();
                    lwr = r.asLowerCase();
                    lower = true;
                }
            }
        }

        static if (isForwardRange!Range)
        {
            @property auto save()
            {
                auto ret = this;
                ret.r = r.save;
                ret.lwr = lwr.save;
                return ret;
            }
        }

      private:
        Range r;
        typeof(r.asLowerCase) lwr; // range representing the lower case rest of string
        bool lower = false;     // false for first character, true for rest of string
        dchar[3] buf = void;    // staging area for the upper-cased first character
        uint nLeft = 0;         // number of staged code points still in buf
    }

    return ToCapitalizerImpl(str);
}
9559 
9560 /*********************
9561  * Capitalize an $(REF_ALTTEXT input range, isInputRange, std,range,primitives)
9562  * or string, meaning convert the first
9563  * character to upper case and subsequent characters to lower case.
9564  *
9565  * Does not allocate memory.
9566  * Characters in UTF-8 or UTF-16 format that cannot be decoded
9567  * are treated as $(REF replacementDchar, std,utf).
9568  *
9569  * Params:
9570  *      str = string or range of characters
9571  *
9572  * Returns:
9573  *      an InputRange of dchars
9574  *
9575  * See_Also:
9576  *      $(LREF toUpper), $(LREF toLower)
9577  *      $(LREF asUpperCase), $(LREF asLowerCase)
9578  */
9579 
auto asCapitalized(Range)(Range str)
if (isInputRange!Range && isSomeChar!(ElementEncodingType!Range) &&
    !isConvertibleToString!Range)
{
    static if (ElementEncodingType!Range.sizeof == dchar.sizeof)
    {
        // already a range of code points: capitalize directly
        return toCapitalizer!UpperTriple(str);
    }
    else
    {
        // UTF-8 / UTF-16 code units: decode to code points first
        import std.utf : byDchar;

        return toCapitalizer!UpperTriple(str.byDchar);
    }
}
9596 
9597 ///
@safe pure unittest
{
    import std.algorithm.comparison : equal;

    // first character upper-cased, the rest lower-cased
    assert("hEllo".asCapitalized.equal("Hello"));
}
9604 
auto asCapitalized(Range)(auto ref Range str)
if (isConvertibleToString!Range)
{
    import std.traits : StringTypeOf;

    // forward to the range overload, instantiated with the string type
    // that Range converts to
    alias Str = StringTypeOf!Range;
    return asCapitalized!Str(str);
}
9611 
// constructing the range and reading front must be usable
// from @safe pure nothrow @nogc code
@safe pure nothrow @nogc unittest
{
    auto r = "hEllo".asCapitalized();
    assert(r.front == 'H');
}
9617 
@safe unittest
{
    import std.array : array;

    // a saved copy must produce the same sequence as the original
    auto a = "hELLo".asCapitalized;
    auto savea = a.save;
    auto s = a.array;
    assert(s == "Hello");
    s = savea.array;
    assert(s == "Hello");

    // pairs of [input, expected capitalized form]
    string[2][] cases =
    [
        ["", ""],
        ["h", "H"],
        ["H", "H"],
        ["3", "3"],
        ["123", "123"],
        ["h123A", "H123a"],
        ["феж", "Феж"],
        ["\u1Fe2", "\u03a5\u0308\u0300"],
    ];

    foreach (i; 0 .. cases.length)
    {
        import std.utf : byChar;

        auto r = cases[i][0].asCapitalized.byChar.array;
        auto result = cases[i][1];
        assert(r == result);
    }

    // Don't call r.front
    for (auto r = "\u1Fe2".asCapitalized; !r.empty; r.popFront())
    {
    }

    import std.algorithm.comparison : equal;

    // wstring and dstring inputs; the results have to be asserted,
    // a bare call to equal checks nothing
    assert("HELLo"w.asCapitalized.equal("Hello"d));
    assert("hElLO"w.asCapitalized.equal("Hello"d));
    assert("hello"d.asCapitalized.equal("Hello"d));
    assert("HELLO"d.asCapitalized.equal("Hello"d));

    import std.utf : byChar;
    assert(asCapitalized("\u0130").byChar.array == asUpperCase("\u0130").byChar.array);
}
9665 
// TODO: helper; ideally std.utf would provide a more flexible
// (and more straightforward) primitive for this.
//
// Writes the UTF-8 encoding of `c` into `buf` starting at `idx` and returns
// the index just past the last code unit written. Code points above
// 0x10FFFF hit assert(0).
private size_t encodeTo(scope char[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    // one code unit: plain ASCII
    if (c <= 0x7F)
    {
        buf[idx] = cast(char) c;
        return idx + 1;
    }
    // two code units: 110xxxxx 10xxxxxx
    if (c <= 0x7FF)
    {
        buf[idx] = cast(char)(0xC0 | (c >> 6));
        buf[idx+1] = cast(char)(0x80 | (c & 0x3F));
        return idx + 2;
    }
    // three code units: 1110xxxx 10xxxxxx 10xxxxxx
    if (c <= 0xFFFF)
    {
        buf[idx] = cast(char)(0xE0 | (c >> 12));
        buf[idx+1] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[idx+2] = cast(char)(0x80 | (c & 0x3F));
        return idx + 3;
    }
    // four code units: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    if (c <= 0x10FFFF)
    {
        buf[idx] = cast(char)(0xF0 | (c >> 18));
        buf[idx+1] = cast(char)(0x80 | ((c >> 12) & 0x3F));
        buf[idx+2] = cast(char)(0x80 | ((c >> 6) & 0x3F));
        buf[idx+3] = cast(char)(0x80 | (c & 0x3F));
        return idx + 4;
    }
    assert(0);
}
9699 
// encodeTo(char[]) overwrites the buffer at the given index and returns
// the index following the encoded sequence
@safe unittest
{
    char[] s = "abcd".dup;
    size_t i = 0;
    i = encodeTo(s, i, 'X');
    assert(s == "Xbcd");

    // U+00A9 occupies two UTF-8 code units, written right after 'X'
    i = encodeTo(s, i, cast(dchar)'\u00A9');
    assert(s == "X\xC2\xA9d");
}
9710 
// TODO: helper; ideally std.utf would provide a more flexible
// (and more straightforward) primitive for this.
//
// Writes the UTF-16 encoding of `c` into `buf` starting at `idx` and returns
// the index just past the last code unit written. Throws UTFException for
// a code point in the surrogate range 0xD800 .. 0xDFFF; code points above
// 0x10FFFF hit assert(0).
private size_t encodeTo(scope wchar[] buf, size_t idx, dchar c) @trusted pure
{
    import std.utf : UTFException;

    if (c > 0x10FFFF)
        assert(0);

    if (c > 0xFFFF)
    {
        // supplementary plane: split into a high/low surrogate pair
        buf[idx] = cast(wchar)((((c - 0x10000) >> 10) & 0x3FF) + 0xD800);
        buf[idx+1] = cast(wchar)(((c - 0x10000) & 0x3FF) + 0xDC00);
        return idx + 2;
    }

    // BMP: a lone surrogate cannot be encoded
    if (0xD800 <= c && c <= 0xDFFF)
        throw (new UTFException("Encoding an isolated surrogate code point in UTF-16")).setSequence(c);
    buf[idx] = cast(wchar) c;
    return idx + 1;
}
9732 
// UTF-32 counterpart of the helpers above: stores `c` at `buf[idx]` and
// returns the index of the next slot.
private size_t encodeTo(scope dchar[] buf, size_t idx, dchar c) @trusted pure nothrow @nogc
{
    buf[idx] = c;
    return idx + 1;
}
9739 
/*
    Re-cases `s` in place, reusing its buffer for as long as the converted
    text fits; otherwise hands over to toCaseInPlaceAlloc, which allocates.

    Template params:
        indexFn = maps a code point to a case index; ushort.max means the
                  code point is left unchanged
        maxIdx  = indices below this denote a 1:1 mapping, all others a
                  1:m mapping
        tableFn = maps a case index to the table entry (the cased code point
                  for 1:1 mappings)
    Params:
        s = the array to convert; on return it is either a prefix slice of
            the original buffer or whatever the allocating path assigned
*/
private void toCaseInPlace(alias indexFn, uint maxIdx, alias tableFn, C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar)  || is(C == dchar))
{
    import std.utf : decode, codeLength;
    size_t curIdx = 0;   // read position (advanced by decode)
    size_t destIdx = 0;  // write position, never ahead of curIdx on the fast path
    alias slowToCase = toCaseInPlaceAlloc!(indexFn, maxIdx, tableFn);
    size_t lastUnchanged = 0; // start of the run of unchanged code units not yet moved
    // in-buffer move of bytes to a new start index
    // the trick is that it may not need to copy at all
    static size_t moveTo(C[] str, size_t dest, size_t from, size_t to)
    {
        // Interestingly we may just bump pointer for a while
        // then have to copy if a re-cased char was smaller the original
        // later we may regain pace with char that got bigger
        // In the end it sometimes flip-flops between the 2 cases below
        if (dest == from)
            return to;
        // got to copy
        foreach (C c; str[from .. to])
            str[dest++] = c;
        return dest;
    }
    while (curIdx != s.length)
    {
        size_t startIdx = curIdx; // start of the code point decoded below
        immutable ch = decode(s, curIdx);
        // TODO: special case for ASCII
        immutable caseIndex = indexFn(ch);
        if (caseIndex == ushort.max) // unchanged, skip over
        {
            continue;
        }
        else if (caseIndex < maxIdx)  // 1:1 codepoint mapping
        {
            // previous cased chars had the same length as uncased ones
            // thus can just adjust pointer
            destIdx = moveTo(s, destIdx, lastUnchanged, startIdx);
            lastUnchanged = curIdx;
            immutable cased = tableFn(caseIndex);
            immutable casedLen = codeLength!C(cased);
            if (casedLen + destIdx > curIdx) // no place to fit cased char
            {
                // switch to slow codepath, where we allocate
                // (it restarts from the current code point at startIdx)
                return slowToCase(s, startIdx, destIdx);
            }
            else
            {
                destIdx = encodeTo(s, destIdx, cased);
            }
        }
        else  // 1:m codepoint mapping, slow codepath
        {
            destIdx = moveTo(s, destIdx, lastUnchanged, startIdx);
            lastUnchanged = curIdx;
            return slowToCase(s, startIdx, destIdx);
        }
        assert(destIdx <= curIdx);
    }
    // move the trailing unchanged run, if any
    if (lastUnchanged != s.length)
    {
        destIdx = moveTo(s, destIdx, lastUnchanged, s.length);
    }
    // trim to the number of code units actually written
    s = s[0 .. destIdx];
}
9805 
// Helper that computes, without converting anything, how many code units
// of type C the case-converted form of a string occupies.
private template toCaseLength(alias indexFn, uint maxIdx, alias tableFn)
{
    size_t toCaseLength(C)(const scope C[] str)
    {
        import std.utf : decode, codeLength;
        size_t total = 0;    // code units counted so far
        size_t pending = 0;  // start of the unchanged run not yet counted
        for (size_t pos = 0; pos != str.length; )
        {
            immutable cpStart = pos;
            immutable ch = decode(str, pos);
            immutable ushort caseIndex = indexFn(ch);
            if (caseIndex == ushort.max)
                continue; // unchanged: stays part of the pending run

            // count the unchanged run that precedes this code point
            total += cpStart - pending;
            pending = pos;

            if (caseIndex < maxIdx)
            {
                // 1:1 mapping: a single replacement code point
                total += codeLength!C(tableFn(caseIndex));
            }
            else
            {
                // 1:m mapping: the first entry packs the sequence length in
                // its top 8 bits and the first code point in the low 24
                immutable val = tableFn(caseIndex);
                immutable len = val >> 24;
                immutable dchar cased = val & 0xFF_FFFF;
                total += codeLength!C(cased);
                foreach (j; caseIndex+1 .. caseIndex+len)
                    total += codeLength!C(tableFn(j));
            }
        }
        // trailing unchanged run (contributes zero when there is none)
        total += str.length - pending;
        return total;
    }
}
9846 
@safe unittest
{
    alias toLowerLength = toCaseLength!(LowerTriple);
    assert(toLowerLength("abcd") == 4);
    // five Cyrillic letters at two UTF-8 code units each, plus three ASCII digits
    assert(toLowerLength("аБВгд456") == 10+3);
}
9853 
// Slower code path: computes the exact converted length up front, allocates
// a new array of that size and copies the case-converted text into it.
// `s[0 .. destIdx]` is taken as already converted; conversion resumes at
// `curIdx`. On return `s` refers to the new array.
private template toCaseInPlaceAlloc(alias indexFn, uint maxIdx, alias tableFn)
{
    void toCaseInPlaceAlloc(C)(ref C[] s, size_t curIdx,
        size_t destIdx) @trusted pure
        if (is(C == char) || is(C == wchar) || is(C == dchar))
    {
        import std.utf : decode;
        alias caseLength = toCaseLength!(indexFn, maxIdx, tableFn);
        immutable trueLength = destIdx + caseLength(s[curIdx..$]);
        C[] ns = new C[trueLength];
        // keep the part that the in-place pass already produced
        ns[0 .. destIdx] = s[0 .. destIdx];
        size_t lastUnchanged = curIdx; // start of the unchanged run not yet copied
        while (curIdx != s.length)
        {
            immutable startIdx = curIdx; // start of current codepoint
            immutable ch = decode(s, curIdx);
            immutable caseIndex = indexFn(ch);
            if (caseIndex == ushort.max)
                continue; // unchanged: copied later as part of a run

            // flush the unchanged run preceding this code point
            immutable runLength = startIdx - lastUnchanged;
            ns[destIdx .. destIdx+runLength] = s[lastUnchanged .. startIdx];
            lastUnchanged = curIdx;
            destIdx += runLength;

            if (caseIndex < maxIdx)
            {
                // 1:1 codepoint mapping
                destIdx = encodeTo(ns, destIdx, tableFn(caseIndex));
            }
            else
            {
                // 1:m codepoint mapping: unpack length + first codepoint,
                // the remaining codepoints follow in the table
                auto val = tableFn(caseIndex);
                immutable uint len = val >> 24;
                destIdx = encodeTo(ns, destIdx, cast(dchar)(val & 0xFF_FFFF));
                foreach (j; caseIndex+1 .. caseIndex+len)
                    destIdx = encodeTo(ns, destIdx, tableFn(j));
            }
        }
        // copy the trailing unchanged run, if there is one
        if (lastUnchanged != s.length)
        {
            immutable tail = s.length - lastUnchanged;
            ns[destIdx .. destIdx+tail] = s[lastUnchanged..$];
            destIdx += tail;
        }
        assert(ns.length == destIdx);
        s = ns;
    }
}
9910 
/++
    Converts `s` to lowercase (by performing Unicode lowercase mapping) in place.
    For a few characters string length may increase after the transformation,
    in such a case the function reallocates exactly once.
    If `s` does not have any uppercase characters, then `s` is unaltered.

    Params:
        s = the array of `char`, `wchar` or `dchar` to convert; it is taken
            by reference and may be rebound to a different slice
+/
void toLowerInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    // all of the work is done by the generic helper, driven by the lowercase tables
    toCaseInPlace!(LowerTriple)(s);
}
// overloads for the most common cases to reduce compile time
// (plain functions that forward to the template instantiated for each code unit type)
@safe pure /*TODO nothrow*/
{
    void toLowerInPlace(ref char[] s)
    { toLowerInPlace!char(s); }
    void toLowerInPlace(ref wchar[] s)
    { toLowerInPlace!wchar(s); }
    void toLowerInPlace(ref dchar[] s)
    { toLowerInPlace!dchar(s); }
}
9932 
/++
    Converts `s` to uppercase (by performing Unicode uppercase mapping) in place.
    For a few characters string length may increase after the transformation,
    in such a case the function reallocates exactly once.
    If `s` does not have any lowercase characters, then `s` is unaltered.

    Params:
        s = the array of `char`, `wchar` or `dchar` to convert; it is taken
            by reference and may be rebound to a different slice
+/
void toUpperInPlace(C)(ref C[] s) @trusted pure
if (is(C == char) || is(C == wchar) || is(C == dchar))
{
    // all of the work is done by the generic helper, driven by the uppercase tables
    toCaseInPlace!(UpperTriple)(s);
}
// overloads for the most common cases to reduce compile time/code size
// (plain functions that forward to the template instantiated for each code unit type)
@safe pure /*TODO nothrow*/
{
    void toUpperInPlace(ref char[] s)
    { toUpperInPlace!char(s); }
    void toUpperInPlace(ref wchar[] s)
    { toUpperInPlace!wchar(s); }
    void toUpperInPlace(ref dchar[] s)
    { toUpperInPlace!dchar(s); }
}
9954 
/++
    Returns the lowercase equivalent of `c` when `c` is a Unicode uppercase
    $(CHARACTER); any other `c` is returned unchanged.

    Warning: certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use overload of toLower which takes full string instead.
+/
@safe pure nothrow @nogc
dchar toLower(dchar c)
{
    // Fast path without a table lookup: below 0xAA only 'A' .. 'Z' are changed.
    if (c < 0xAA)
    {
        if ('A' <= c && c <= 'Z')
            return c + 32;
        return c;
    }

    // Everything else goes through the simple (1:1) case-mapping tables;
    // ushort.max marks a code point without a mapping.
    immutable size_t idx = toLowerSimpleIndex(c);
    if (idx == ushort.max)
        return c;
    return toLowerTab(idx);
}
9981 
/++
    Creates a new array which is identical to `s` except that all of its
    characters are converted to lowercase (by performing Unicode lowercase mapping).
    If none of `s` characters were affected, then `s` itself is returned if `s` is a
    `string`-like type.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
        of characters
    Returns:
        An array with the same element type as `s`.
+/
ElementEncodingType!S[] toLower(S)(return scope S s) @trusted
if (isSomeString!S)
{
    static import std.ascii;
    // toCase gets both the Unicode lowercase tables and the ASCII-only
    // std.ascii.toLower (presumably its ASCII fast path; see toCase to confirm)
    return toCase!(LowerTriple, std.ascii.toLower)(s);
}
10000 
/// ditto
ElementEncodingType!S[] toLower(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    // Overload for non-string random-access ranges of characters that
    // provide length and slicing; same forwarding as the string overload.
    static import std.ascii;
    return toCase!(LowerTriple, std.ascii.toLower)(s);
}
10008 
// overloads for the most common cases to reduce compile time
// (plain functions that forward to the template instantiated for each string type)
@safe pure /*TODO nothrow*/
{
    string toLower(return scope string s)
    { return toLower!string(s); }
    wstring toLower(return scope wstring s)
    { return toLower!wstring(s); }
    dstring toLower(return scope dstring s)
    { return toLower!dstring(s); }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663
        // toLower must accept a struct that converts to string via alias this;
        // this is a compile-only check, foo is never called

        static struct String
        {
            string data;
            alias data this;
        }

        void foo()
        {
            auto u = toLower(String(""));
        }
    }
}
10035 
10036 
@safe unittest
{
    static import std.ascii;
    import std.format : format;
    // must agree with std.ascii.toLower on the whole ASCII range
    foreach (ch; 0 .. 0x80)
        assert(std.ascii.toLower(ch) == toLower(ch));
    assert(toLower('Я') == 'я');
    assert(toLower('Δ') == 'δ');
    // every uppercase code point either maps to itself or to a lowercase one
    foreach (ch; unicode.upperCase.byCodepoint)
    {
        dchar low = ch.toLower();
        assert(low == ch || isLower(low), format("%s -> %s", ch, low));
    }
    assert(toLower("АЯ") == "ая");

    // string overloads: U+1E9E lowers to U+00DF, and U+00DF uppers to "SS"
    assert("\u1E9E".toLower == "\u00df");
    assert("\u00df".toUpper == "SS");
}
10055 
// https://issues.dlang.org/show_bug.cgi?id=9629
// converting a one-element slice in place must be visible in the parent array
@safe unittest
{
    wchar[] test = "hello þ world"w.dup;
    auto piece = test[6 .. 7];
    toUpperInPlace(piece);
    assert(test == "hello Þ world");
}
10064 
10065 
@safe unittest
{
    import std.algorithm.comparison : cmp;
    // ASCII only; toLower and toLowerInPlace must agree
    string s1 = "FoL";
    string s2 = toLower(s1);
    assert(cmp(s2, "fol") == 0, s2);
    assert(s2 != s1);

    char[] s3 = s1.dup;
    toLowerInPlace(s3);
    assert(s3 == s2);

    // mixed ASCII and U+0100/U+0101
    s1 = "A\u0100B\u0101d";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(cmp(s2, "a\u0101b\u0101d") == 0);
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    // mixed ASCII and U+0460/U+0461
    s1 = "A\u0460B\u0461d";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(cmp(s2, "a\u0461b\u0461d") == 0);
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    // U+0130 lowers to two code points, so the result grows
    s1 = "\u0130";
    s2 = toLower(s1);
    s3 = s1.dup;
    assert(s2 == "i\u0307");
    assert(s2 !is s1);
    toLowerInPlace(s3);
    assert(s3 == s2);

    // Test on wchar and dchar strings.
    assert(toLower("Some String"w) == "some string"w);
    assert(toLower("Some String"d) == "some string"d);

    // https://issues.dlang.org/show_bug.cgi?id=12455
    dchar c = 'İ'; // '\U0130' LATIN CAPITAL LETTER I WITH DOT ABOVE
    assert(isUpper(c));
    assert(toLower(c) == 'i');
    // extends on https://issues.dlang.org/show_bug.cgi?id=12455 report
    // check simple-case toUpper too
    c = '\u1f87';
    assert(isLower(c));
    assert(toUpper(c) == '\u1F8F');
}
10116 
// toLower on non-string random-access ranges (byCodeUnit wrappers)
@safe pure unittest
{
    import std.algorithm.comparison : cmp, equal;
    import std.utf : byCodeUnit;
    auto r1 = "FoL".byCodeUnit;
    assert(r1.toLower.cmp("fol") == 0);
    auto r2 = "A\u0460B\u0461d".byCodeUnit;
    assert(r2.toLower.cmp("a\u0461b\u0461d") == 0);
}
10126 
/++
    Returns the uppercase equivalent of `c` when `c` is a Unicode lowercase
    $(CHARACTER); any other `c` is returned unchanged.

    Warning:
    Certain alphabets like German and Greek have no 1:1
    upper-lower mapping. Use overload of toUpper which takes full string instead.

    toUpper can be used as an argument to $(REF map, std,algorithm,iteration)
    to produce an algorithm that can convert a range of characters to upper case
    without allocating memory.
    A string can then be produced by using $(REF copy, std,algorithm,mutation)
    to send it to an $(REF appender, std,array).
+/
@safe pure nothrow @nogc
dchar toUpper(dchar c)
{
    // Fast path without a table lookup: below 0xAA only 'a' .. 'z' are changed.
    if (c < 0xAA)
    {
        if ('a' <= c && c <= 'z')
            return c - 32;
        return c;
    }

    // Everything else goes through the simple (1:1) case-mapping tables;
    // ushort.max marks a code point without a mapping.
    immutable size_t idx = toUpperSimpleIndex(c);
    if (idx == ushort.max)
        return c;
    return toUpperTab(idx);
}
10160 
///
@safe unittest
{
    import std.algorithm.iteration : map;
    import std.algorithm.mutation : copy;
    import std.array : appender;

    // upper-case each character lazily and collect the result in an appender
    auto abuf = appender!(char[])();
    "hello".map!toUpper.copy(abuf);
    assert(abuf.data == "HELLO");
}
10172 
@safe unittest
{
    static import std.ascii;
    import std.format : format;
    // must agree with std.ascii.toUpper on the whole ASCII range
    foreach (ch; 0 .. 0x80)
        assert(std.ascii.toUpper(ch) == toUpper(ch));
    assert(toUpper('я') == 'Я');
    assert(toUpper('δ') == 'Δ');
    auto title = unicode.Titlecase_Letter;
    // every lowercase code point maps to itself, to an uppercase code point
    // or to a titlecase letter
    foreach (ch; unicode.lowerCase.byCodepoint)
    {
        dchar up = ch.toUpper();
        assert(up == ch || isUpper(up) || title[up],
            format("%x -> %x", ch, up));
    }
}
10189 
/++
    Allocates a new array which is identical to `s` except that all of its
    characters are converted to uppercase (by performing Unicode uppercase mapping).
    If none of `s` characters were affected, then `s` itself is returned if `s`
    is a `string`-like type.

    Params:
        s = A $(REF_ALTTEXT random access range, isRandomAccessRange, std,range,primitives)
        of characters
    Returns:
        A new array with the same element type as `s`.
+/
ElementEncodingType!S[] toUpper(S)(return scope S s) @trusted
if (isSomeString!S)
{
    static import std.ascii;
    // toCase gets both the Unicode uppercase tables and the ASCII-only
    // std.ascii.toUpper (presumably its ASCII fast path; see toCase to confirm)
    return toCase!(UpperTriple, std.ascii.toUpper)(s);
}
10208 
/// ditto
ElementEncodingType!S[] toUpper(S)(S s)
if (!isSomeString!S && (isRandomAccessRange!S && hasLength!S && hasSlicing!S && isSomeChar!(ElementType!S)))
{
    // Overload for non-string random-access ranges of characters that
    // provide length and slicing; same forwarding as the string overload.
    static import std.ascii;
    return toCase!(UpperTriple, std.ascii.toUpper)(s);
}
10216 
// overloads for the most common cases to reduce compile time
// (plain functions that forward to the template instantiated for each string type)
@safe pure /*TODO nothrow*/
{
    string toUpper(return scope string s)
    { return toUpper!string(s); }
    wstring toUpper(return scope wstring s)
    { return toUpper!wstring(s); }
    dstring toUpper(return scope dstring s)
    { return toUpper!dstring(s); }

    @safe unittest
    {
        // https://issues.dlang.org/show_bug.cgi?id=16663
        // toUpper must accept a struct that converts to string via alias this;
        // this is a compile-only check, foo is never called

        static struct String
        {
            string data;
            alias data this;
        }

        void foo()
        {
            auto u = toUpper(String(""));
        }
    }
}
10243 
// toUpper and toUpperInPlace must produce the same result
@safe unittest
{
    import std.algorithm.comparison : cmp;

    string s1 = "FoL";
    string s2;
    char[] s3;

    // ASCII only
    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2, s3);
    assert(cmp(s2, "FOL") == 0);
    assert(s2 !is s1);

    // mixed ASCII and U+0100/U+0101
    s1 = "a\u0100B\u0101d";
    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2);
    assert(cmp(s2, "A\u0100B\u0100D") == 0);
    assert(s2 !is s1);

    // mixed ASCII and U+0460/U+0461
    s1 = "a\u0460B\u0461d";
    s2 = toUpper(s1);
    s3 = s1.dup; toUpperInPlace(s3);
    assert(s3 == s2);
    assert(cmp(s2, "A\u0460B\u0460D") == 0);
    assert(s2 !is s1);
}
10272 
// Cross-checks toLower/toUpper against toLowerInPlace/toUpperInPlace for
// every string width, on single samples and on concatenations of them.
@safe unittest
{
    static void doTest(C)(const(C)[] s, const(C)[] trueUp, const(C)[] trueLow)
    {
        import std.format : format;
        // three specifiers: source, actual result, expected result
        string diff = "src: %( %x %)\nres: %( %x %)\ntru: %( %x %)";
        auto low = s.toLower() , up = s.toUpper();
        auto lowInp = s.dup, upInp = s.dup;
        lowInp.toLowerInPlace();
        upInp.toUpperInPlace();
        // every message supplies all three arguments that `diff` expects, so
        // a failing assertion reports the diagnostic rather than a format error
        assert(low == trueLow, format(diff, s, low, trueLow));
        assert(up == trueUp,  format(diff, s, up, trueUp));
        assert(lowInp == trueLow,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) lowInp, cast(const(ubyte)[]) trueLow));
        assert(upInp == trueUp,
            format(diff, cast(const(ubyte)[]) s, cast(const(ubyte)[]) upInp, cast(const(ubyte)[]) trueUp));
    }
    static foreach (S; AliasSeq!(dstring, wstring, string))
    {{

        S easy = "123";
        S good = "abCФеж";
        S awful = "\u0131\u023f\u2126";
        S wicked = "\u0130\u1FE2";
        auto options = [easy, good, awful, wicked];
        // expected results, index-aligned with `options`
        S[] lower = ["123", "abcфеж", "\u0131\u023f\u03c9", "i\u0307\u1Fe2"];
        S[] upper = ["123", "ABCФЕЖ", "I\u2c7e\u2126", "\u0130\u03A5\u0308\u0300"];

        // for these samples the in-place conversions must keep the same slice
        foreach (val; [easy, good])
        {
            auto e = val.dup;
            auto g = e;
            e.toUpperInPlace();
            assert(e is g);
            e.toLowerInPlace();
            assert(e is g);
        }
        foreach (i, v; options)
        {
            doTest(v, upper[i], lower[i]);
        }

        // a few combinatorial runs
        foreach (i; 0 .. options.length)
        foreach (j; i .. options.length)
        foreach (k; j .. options.length)
        {
            auto sample = options[i] ~ options[j] ~ options[k];
            auto sample2 = options[k] ~ options[j] ~ options[i];
            doTest(sample, upper[i] ~ upper[j] ~ upper[k],
                lower[i] ~ lower[j] ~ lower[k]);
            doTest(sample2, upper[k] ~ upper[j] ~ upper[i],
                lower[k] ~ lower[j] ~ lower[i]);
        }
    }}
}
10329 
// test random access ranges
// (byCodeUnit wrappers exercise the non-string overload of toUpper)
@safe pure unittest
{
    import std.algorithm.comparison : cmp;
    import std.utf : byCodeUnit;
    auto s1 = "FoL".byCodeUnit;
    assert(s1.toUpper.cmp("FOL") == 0);
    auto s2 = "a\u0460B\u0461d".byCodeUnit;
    assert(s2.toUpper.cmp("A\u0460B\u0460D") == 0);
}
10340 
/++
    Tests whether `c` is a Unicode alphabetic $(CHARACTER)
    (general Unicode category: Alphabetic).
+/
@safe pure nothrow @nogc
bool isAlpha(dchar c)
{
    // from 0xAA upwards the answer comes from the trie
    if (c >= 0xAA)
        return alphaTrie[c];

    // below 0xAA only the ASCII letters qualify, no lookup needed
    if ('a' <= c && c <= 'z')
        return true;
    return 'A' <= c && c <= 'Z';
}
10356 
@safe unittest
{
    auto alpha = unicode("Alphabetic");
    // every member of the set must be reported as alphabetic ...
    foreach (ch; alpha.byCodepoint)
        assert(isAlpha(ch));
    // ... and for the first 0x4000 code points the two must agree exactly
    foreach (ch; 0 .. 0x4000)
        assert((ch in alpha) == isAlpha(ch));
}
10365 
10366 
/++
    Returns whether `c` is a Unicode mark
    (general Unicode category: Mn, Me, Mc).
+/
@safe pure nothrow @nogc
bool isMark(dchar c)
{
    // plain lookup in the mark trie, no ASCII shortcut
    return markTrie[c];
}
10376 
@safe unittest
{
    auto mark = unicode("Mark");
    // every member of the set must be reported as a mark ...
    foreach (ch; mark.byCodepoint)
        assert(isMark(ch));
    // ... and for the first 0x4000 code points the two must agree exactly
    foreach (ch; 0 .. 0x4000)
        assert((ch in mark) == isMark(ch));
}
10385 
/++
    Tests whether `c` is a Unicode numerical $(CHARACTER)
    (general Unicode category: Nd, Nl, No).
+/
@safe pure nothrow @nogc
bool isNumber(dchar c)
{
    // non-ASCII code points are answered by the trie
    if (c > 0x7F)
        return numberTrie[c];

    // within ASCII only the decimal digits count
    return '0' <= c && c <= '9';
}
10403 
@safe unittest
{
    auto n = unicode("N");
    // every member of the set must be reported as a number ...
    foreach (ch; n.byCodepoint)
        assert(isNumber(ch));
    // ... and for the first 0x4000 code points the two must agree exactly
    foreach (ch; 0 .. 0x4000)
        assert((ch in n) == isNumber(ch));
}
10412 
/++
    Tests whether `c` is a Unicode alphabetic $(CHARACTER) or number.
    (general Unicode category: Alphabetic, Nd, Nl, No).

    Params:
        c = any Unicode character
    Returns:
        `true` if the character is in the Alphabetic, Nd, Nl, or No Unicode
        categories
+/
@safe pure nothrow @nogc
bool isAlphaNum(dchar c)
{
    static import std.ascii;

    // ASCII is delegated to std.ascii; everything else combines the two
    // Unicode predicates
    return std.ascii.isASCII(c)
        ? std.ascii.isAlphaNum(c)
        : (isAlpha(c) || isNumber(c));
}
10438 
@safe unittest
{
    auto n = unicode("N");
    auto alpha = unicode("Alphabetic");

    // members of either set must be reported as alphanumeric
    foreach (ch; n.byCodepoint)
        assert(isAlphaNum(ch));

    foreach (ch; alpha.byCodepoint)
        assert(isAlphaNum(ch));

    // for the first 0x4000 code points the result must equal the union of both sets
    foreach (ch; 0 .. 0x4000)
    {
        assert(((ch in n) || (ch in alpha)) == isAlphaNum(ch));
    }
}
10455 
/++
    Tests whether `c` is a Unicode punctuation $(CHARACTER)
    (general Unicode category: Pd, Ps, Pe, Pc, Po, Pi, Pf).
+/
@safe pure nothrow @nogc
bool isPunctuation(dchar c)
{
    static import std.ascii;

    // non-ASCII code points are answered by the trie
    if (c > 0x7F)
        return punctuationTrie[c];

    // ASCII is delegated to std.ascii
    return std.ascii.isPunctuation(c);
}
10475 
@safe unittest
{
    // spot checks: five ASCII code points, then two above 0x7F
    assert(isPunctuation('\u0021'));
    assert(isPunctuation('\u0028'));
    assert(isPunctuation('\u0029'));
    assert(isPunctuation('\u002D'));
    assert(isPunctuation('\u005F'));
    assert(isPunctuation('\u00AB'));
    assert(isPunctuation('\u00BB'));
    // every member of the "P" set must be reported as punctuation
    foreach (ch; unicode("P").byCodepoint)
        assert(isPunctuation(ch));
}
10488 
/++
    Returns whether `c` is a Unicode symbol $(CHARACTER)
    (general Unicode category: Sm, Sc, Sk, So).
+/
@safe pure nothrow @nogc
bool isSymbol(dchar c)
{
   // plain lookup in the symbol trie, no ASCII shortcut
   return symbolTrie[c];
}
10498 
@safe unittest
{
    import std.format : format;
    // spot checks on individual code points
    assert(isSymbol('\u0024'));
    assert(isSymbol('\u002B'));
    assert(isSymbol('\u005E'));
    assert(isSymbol('\u00A6'));
    // every member of the "S" set must be reported as a symbol
    foreach (ch; unicode("S").byCodepoint)
        assert(isSymbol(ch), format("%04x", ch));
}
10509 
/++
    Returns whether `c` is a Unicode space $(CHARACTER)
    (general Unicode category: Zs)
    Note: This doesn't include '\n', '\r', '\t' and other non-space $(CHARACTER).
    For commonly used less strict semantics see $(LREF isWhite).
+/
@safe pure nothrow @nogc
bool isSpace(dchar c)
{
    // the predicate itself lives in the generated tables module
    import std.internal.unicode_tables : isSpaceGen; // generated file
    return isSpaceGen(c);
}
10522 
@safe unittest
{
    assert(isSpace('\u0020'));
    auto space = unicode.Zs;
    // every member of Zs must be reported as a space ...
    foreach (ch; space.byCodepoint)
        assert(isSpace(ch));
    // ... and for the first 0x1000 code points the two must agree exactly
    foreach (ch; 0 .. 0x1000)
        assert(isSpace(ch) == space[ch]);
}
10532 
10533 
/++
    Returns whether `c` is a Unicode graphical $(CHARACTER)
    (general Unicode category: L, M, N, P, S, Zs).

+/
@safe pure nothrow @nogc
bool isGraphical(dchar c)
{
    // plain lookup in the graphical trie, no ASCII shortcut
    return graphicalTrie[c];
}
10544 
10545 
@safe unittest
{
    auto set = unicode("Graphical");
    import std.format : format;
    // every member of the set must be reported as graphical ...
    foreach (ch; set.byCodepoint)
        assert(isGraphical(ch), format("%4x", ch));
    // ... and for the first 0x4000 code points the two must agree exactly
    foreach (ch; 0 .. 0x4000)
        assert((ch in set) == isGraphical(ch));
}
10555 
10556 
/++
    Returns whether `c` is a Unicode control $(CHARACTER)
    (general Unicode category: Cc).
+/
@safe pure nothrow @nogc
bool isControl(dchar c)
{
    // the predicate itself lives in the generated tables module
    import std.internal.unicode_tables : isControlGen; // generated file
    return isControlGen(c);
}
10567 
@safe unittest
{
    // spot checks: two control code points and one that is not
    assert(isControl('\u0000'));
    assert(isControl('\u0081'));
    assert(!isControl('\u0100'));
    auto cc = unicode.Cc;
    // every member of Cc must be reported as control ...
    foreach (ch; cc.byCodepoint)
        assert(isControl(ch));
    // ... and for the first 0x1000 code points the two must agree exactly
    foreach (ch; 0 .. 0x1000)
        assert(isControl(ch) == cc[ch]);
}
10579 
10580 
/++
    Returns whether `c` is a Unicode formatting $(CHARACTER)
    (general Unicode category: Cf).
+/
@safe pure nothrow @nogc
bool isFormat(dchar c)
{
    // the predicate itself lives in the generated tables module
    import std.internal.unicode_tables : isFormatGen; // generated file
    return isFormatGen(c);
}
10591 
10592 
@safe unittest
{
    assert(isFormat('\u00AD'));
    // every member of the "Format" set must be reported as a format character
    foreach (ch; unicode("Format").byCodepoint)
        assert(isFormat(ch));
}
10599 
// The private-use and surrogate code point ranges are unlikely to change in the near future;
// if need be, they can be generated from the Unicode data as well.
10602 
/++
    Tests whether `c` is a Unicode Private Use $(CODEPOINT)
    (general Unicode category: Co).
+/
@safe pure nothrow @nogc
bool isPrivateUse(dchar c)
{
    // first range: 0xE000 .. 0xF8FF
    if (0x00_E000 <= c && c <= 0x00_F8FF)
        return true;
    // second range: 0xF0000 .. 0xFFFFD
    if (0x0F_0000 <= c && c <= 0x0F_FFFD)
        return true;
    // third range: 0x100000 .. 0x10FFFD
    return 0x10_0000 <= c && c <= 0x10_FFFD;
}
10614 
/++
    Tests whether `c` is a Unicode surrogate $(CODEPOINT)
    (general Unicode category: Cs).
+/
@safe pure nothrow @nogc
bool isSurrogate(dchar c)
{
    // inside 0xD800 .. 0xDFFF, phrased as "not outside the range"
    return !(c < 0xD800 || c > 0xDFFF);
}
10624 
/++
    Tests whether `c` is a Unicode high surrogate (lead surrogate).

    Params:
        c = the code point to classify

    Returns:
        `true` if `c` lies in U+D800 .. U+DBFF, `false` otherwise.
+/
@safe pure nothrow @nogc
bool isSurrogateHi(dchar c)
{
    enum dchar firstLead = 0xD800;
    enum dchar lastLead = 0xDBFF;

    return c >= firstLead && c <= lastLead;
}
10633 
/++
    Tests whether `c` is a Unicode low surrogate (trail surrogate).

    Params:
        c = the code point to classify

    Returns:
        `true` if `c` lies in U+DC00 .. U+DFFF, `false` otherwise.
+/
@safe pure nothrow @nogc
bool isSurrogateLo(dchar c)
{
    enum dchar firstTrail = 0xDC00;
    enum dchar lastTrail = 0xDFFF;

    return c >= firstTrail && c <= lastTrail;
}
10642 
/++
    Tests whether `c` is a Unicode non-character, i.e.
    a $(CODEPOINT) with no assigned abstract character
    (general Unicode category: Cn).

    Params:
        c = the code point to classify

    Returns:
        `true` if the lookup table for this category contains `c`,
        `false` otherwise.
+/
@safe pure nothrow @nogc
bool isNonCharacter(dchar c)
{
    // A single lookup in the pre-built trie answers the question.
    immutable bool found = nonCharacterTrie[c];
    return found;
}
10653 
@safe unittest
{
    // Every code point of the "Cn" set has to be reported by isNonCharacter.
    auto unassigned = unicode("Cn");
    foreach (member; unassigned.byCodepoint)
    {
        assert(isNonCharacter(member));
    }
}
10660 
10661 private:
// Load static data from pre-generated tables into usable data structures.
10663 
10664 
// Builds a CodepointSet from intervals stored in compressed form:
// the bytes are decoded with decompressIntervals and the resulting
// intervals are handed to CodepointSet.fromIntervals.
@safe auto asSet(const (ubyte)[] compressed) pure
{
    auto intervals = decompressIntervals(compressed);
    return CodepointSet.fromIntervals(intervals);
}
10669 
// Wraps the offsets, sizes and data of a pre-generated TrieEntry
// in a const CodepointTrie of the matching parameters.
@safe pure nothrow auto asTrie(T...)(const scope TrieEntry!T e)
{
    alias Result = const(CodepointTrie!T);
    return Result(e.offsets, e.sizes, e.data);
}
10674 
@safe pure nothrow @nogc @property
{
    // Every accessor below wraps one pre-generated table in a trie that is
    // kept in a `static immutable` and handed back to the caller.
    //
    // The `auto` return type is important: with it the compiler only runs
    // semantic on the return type if the function actually gets used.
    // These are also plain functions rather than templates, so that they
    // do not increase the object size of the caller.

    // general classification tables
    auto lowerCaseTrie()
    {
        static immutable trie = asTrie(lowerCaseTrieEntries);
        return trie;
    }

    auto upperCaseTrie()
    {
        static immutable trie = asTrie(upperCaseTrieEntries);
        return trie;
    }

    auto simpleCaseTrie()
    {
        static immutable trie = asTrie(simpleCaseTrieEntries);
        return trie;
    }

    auto fullCaseTrie()
    {
        static immutable trie = asTrie(fullCaseTrieEntries);
        return trie;
    }

    auto alphaTrie()
    {
        static immutable trie = asTrie(alphaTrieEntries);
        return trie;
    }

    auto markTrie()
    {
        static immutable trie = asTrie(markTrieEntries);
        return trie;
    }

    auto numberTrie()
    {
        static immutable trie = asTrie(numberTrieEntries);
        return trie;
    }

    auto punctuationTrie()
    {
        static immutable trie = asTrie(punctuationTrieEntries);
        return trie;
    }

    auto symbolTrie()
    {
        static immutable trie = asTrie(symbolTrieEntries);
        return trie;
    }

    auto graphicalTrie()
    {
        static immutable trie = asTrie(graphicalTrieEntries);
        return trie;
    }

    auto nonCharacterTrie()
    {
        static immutable trie = asTrie(nonCharacterTrieEntries);
        return trie;
    }

    // normalization quick-check tables
    auto nfcQCTrie()
    {
        import std.internal.unicode_norm : nfcQCTrieEntries;

        static immutable trie = asTrie(nfcQCTrieEntries);
        return trie;
    }

    auto nfdQCTrie()
    {
        import std.internal.unicode_norm : nfdQCTrieEntries;

        static immutable trie = asTrie(nfdQCTrieEntries);
        return trie;
    }

    auto nfkcQCTrie()
    {
        import std.internal.unicode_norm : nfkcQCTrieEntries;

        static immutable trie = asTrie(nfkcQCTrieEntries);
        return trie;
    }

    auto nfkdQCTrie()
    {
        import std.internal.unicode_norm : nfkdQCTrieEntries;

        static immutable trie = asTrie(nfkdQCTrieEntries);
        return trie;
    }

    // grapheme breaking algorithm tables
    auto spacingMarkTrie()
    {
        import std.internal.unicode_grapheme : spacingMarkTrieEntries;

        static immutable trie = asTrie(spacingMarkTrieEntries);
        return trie;
    }

    auto graphemeExtendTrie()
    {
        import std.internal.unicode_grapheme : graphemeExtendTrieEntries;

        static immutable trie = asTrie(graphemeExtendTrieEntries);
        return trie;
    }

    auto hangLV()
    {
        import std.internal.unicode_grapheme : hangulLVTrieEntries;

        static immutable trie = asTrie(hangulLVTrieEntries);
        return trie;
    }

    auto hangLVT()
    {
        import std.internal.unicode_grapheme : hangulLVTTrieEntries;

        static immutable trie = asTrie(hangulLVTTrieEntries);
        return trie;
    }

    auto prependTrie()
    {
        import std.internal.unicode_grapheme : prependTrieEntries;

        static immutable trie = asTrie(prependTrieEntries);
        return trie;
    }

    auto graphemeControlTrie()
    {
        import std.internal.unicode_grapheme : controlTrieEntries;

        static immutable trie = asTrie(controlTrieEntries);
        return trie;
    }

    auto xpictoTrie()
    {
        import std.internal.unicode_grapheme : Extended_PictographicTrieEntries;

        static immutable trie = asTrie(Extended_PictographicTrieEntries);
        return trie;
    }

    // tables used for composition/decomposition
    auto combiningClassTrie()
    {
        import std.internal.unicode_comp : combiningClassTrieEntries;

        static immutable trie = asTrie(combiningClassTrieEntries);
        return trie;
    }

    auto compatMappingTrie()
    {
        import std.internal.unicode_decomp : compatMappingTrieEntries;

        static immutable trie = asTrie(compatMappingTrieEntries);
        return trie;
    }

    auto canonMappingTrie()
    {
        import std.internal.unicode_decomp : canonMappingTrieEntries;

        static immutable trie = asTrie(canonMappingTrieEntries);
        return trie;
    }

    auto compositionJumpTrie()
    {
        import std.internal.unicode_comp : compositionJumpTrieEntries;

        static immutable trie = asTrie(compositionJumpTrieEntries);
        return trie;
    }

    // case conversion tables
    auto toUpperIndexTrie()
    {
        static immutable trie = asTrie(toUpperIndexTrieEntries);
        return trie;
    }

    auto toLowerIndexTrie()
    {
        static immutable trie = asTrie(toLowerIndexTrieEntries);
        return trie;
    }

    auto toTitleIndexTrie()
    {
        static immutable trie = asTrie(toTitleIndexTrieEntries);
        return trie;
    }

    // simple case conversion tables
    auto toUpperSimpleIndexTrie()
    {
        static immutable trie = asTrie(toUpperSimpleIndexTrieEntries);
        return trie;
    }

    auto toLowerSimpleIndexTrie()
    {
        static immutable trie = asTrie(toLowerSimpleIndexTrieEntries);
        return trie;
    }

    auto toTitleSimpleIndexTrie()
    {
        static immutable trie = asTrie(toTitleSimpleIndexTrieEntries);
        return trie;
    }

}
10811 
10812 }// version (!std_uni_bootstrap)