1 /**
This is a tool to automatically generate source code for Unicode data structures.

If the Unicode data files are not present, the script will automatically try to download them from: https://www.unicode.org/Public
5 
6 Make sure the current working directory is the /tools folder.
7 
8 To update `std.internal.unicode*.d` files, run:
9 ```
10 rdmd -m32 unicode_table_generator.d
11 rdmd -m64 unicode_table_generator.d --min
12 ```
13 
The -m32 run will replace the files, while the -m64 run with --min will append the 64-bit specific parts.
The 32-bit compilation of the generator is needed because it depends on 32-bit data structures defined in `std.uni`.
To make -m32 work on Linux, you may need to grab a 32-bit `libphobos2.a` from `dmd2/linux/lib32` and pass it as an argument:
17 
18 ```
19 rdmd -m32 -Llibphobos2.a -defaultlib= unicode_table_generator.d
20 ```
21 
22 Pull Requests to untangle this complex bootstrap process are welcome! :)
23 
TODO: Support emitting Turkic casefolding mappings
25 
26 Authors: Dmitry Olshansky
27 
28 License: Boost
29 */
30 module std.unicode_table_generator;
// This module shouldn't be in the std package, but the declarations it relies on are package-protected as std in std.uni.
32 
33 /// Directory in which unicode files are downloaded
34 enum unicodeDir = "ucd-15-1-0/";
35 
36 /// Url from which unicode files are downloaded
37 enum unicodeBaseUrl = "https://www.unicode.org/Public/15.1.0/ucd/";
38 
39 /// Where to put generated files
40 enum outputDir = "../std/internal/";
41 
42 import std.uni, std.stdio, std.traits, std.typetuple,
43     std.exception, std.format, std.algorithm, std.typecons,
44     std.regex, std.range, std.conv, std.getopt, std.path;
45 
46 import std.file : exists;
47 static import std.ascii, std.string;
48 
49 //common binary property sets and their aliases
50 struct PropertyTable
51 {
52     CodepointSet[string] table;
53     string[string] aliases;
54 }
55 
56 PropertyTable general;
57 PropertyTable blocks;
58 PropertyTable scripts;
59 PropertyTable hangul;
60 PropertyTable graphemeBreaks;
61 PropertyTable emojiData;
62 
//quick NO/MAYBE character sets
64 CodepointSet[string] normalization;
65 
//auxiliary sets for case mapping
67 CodepointSet lowerCaseSet, upperCaseSet;
68 //CodepointSet titleCaseSet; //no sensible version found for isTitlecase
69 
// tables for toLower/toUpper/toTitle mappings
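// The first toXxxTabSimpleLen entries of each table come from the 1:1 mappings
// in UnicodeData.txt; entries appended after that come from the unconditional
// SpecialCasing.txt rules (see loadSpecialCasing).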
71 uint[] toLowerTab;
72 ushort toLowerTabSimpleLen; //start of long mappings
73 ushort[dchar] toLowerSimpleIndex, toLowerIndex;
74 uint[] toUpperTab;
75 ushort toUpperTabSimpleLen; //ditto for Upper
76 ushort[dchar] toUpperSimpleIndex, toUpperIndex;
77 uint[] toTitleTab;
78 ushort toTitleTabSimpleLen; //ditto for Title
79 ushort[dchar] toTitleSimpleIndex, toTitleIndex;
80 
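// mixedCCEntry is kept as a string so the very same struct definitions can be
// mixed in here and also written verbatim into the generated tables
// (see writeCaseFolding).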
81 mixin(mixedCCEntry);
82 
83 //case folding mapping
84 SimpleCaseEntry[] simpleTable;
85 FullCaseEntry[] fullTable;
86 
87 ///canonical combining class
88 CodepointSet[256] combiningClass;
89 //same but packaged per dchar
90 ubyte[dchar] combiningMapping;
91 
92 //unrolled decompositions
93 dstring[dchar] canonDecomp;
94 dstring[dchar] compatDecomp;
95 
96 //canonical composition tables
97 dchar[] canonicalyComposableLeft;
98 dchar[] canonicalyComposableRight;
99 
100 
101 //canonical composition exclusions
102 CodepointSet compExclusions;
103 
104 //property names to discard
105 string[] blacklist = [];
106 
107 struct FullCaseEntry
108 {
109     dchar[3] seq = 0;
110     ubyte n; /// number in batch
    ubyte size; /// size of batch
112     ubyte entry_len;
113 
114     auto value() const @safe pure nothrow @nogc return
115     {
116         return seq[0 .. entry_len];
117     }
118 }
119 
/// Plain 8-byte SimpleCaseEntry; compressed to SCE, which bit-packs the same values into 4 bytes
121 struct SimpleCaseEntry
122 {
123     uint ch;
124     ubyte n; // number in bucket
125     ubyte size;
126     bool isLower;
127     bool isUpper;
128 }
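// SCE packs a SimpleCaseEntry into one uint: ch in the low bits (17 bits used),
// n at bits 20..23 and size at bits 24..31, exactly as the SCE constructor
// below composes them.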
129 
130 enum mixedCCEntry = `
131 /// Simple Case Entry, wrapper around uint to extract bit fields from simpleCaseTable()
132 struct SCE
133 {
134     uint x;
135 
136     nothrow @nogc pure @safe:
137 
138     this(uint x)
139     {
140         this.x = x;
141     }
142 
143     this(uint ch, ubyte n, ubyte size)
144     {
145         this.x = ch | n << 20 | size << 24;
146     }
147 
148     int ch() const { return this.x & 0x1FFFF; }
149     int n() const { return (this.x >> 20) & 0xF; }
150     int size() const { return this.x >> 24; }
151 }
152 
/// Bit-packed FullCaseEntry
154 struct FCE
155 {
156     ulong x; // bit field sizes: 18, 12, 12, 4, 4, 4
157 
158 nothrow @nogc pure @safe:
159 
160     this(ulong x)
161     {
162         this.x = x;
163     }
164 
165     this(dchar[3] seq, ubyte n, ubyte size, ubyte entry_len)
166     {
167         this.x = ulong(seq[0]) << 36 | ulong(seq[1]) << 24 | seq[2] << 12 | n << 8 | size << 4 | entry_len << 0;
168     }
169 
170     dchar[3] seq() const { return [(x >> 36) & 0x1FFFF, (x >> 24) & 0xFFF, (x >> 12) & 0xFFF]; }
171     ubyte n() const { return (x >> 8) & 0xF; }
172     ubyte size() const { return (x >> 4) & 0xF; }
173     ubyte entry_len() const { return (x >> 0) & 0xF; }
174 }
175 
176 struct UnicodeProperty
177 {
178     string name;
179     ubyte[] compressed;
180 }
181 
182 struct TrieEntry(T...)
183 {
184     immutable(size_t)[] offsets;
185     immutable(size_t)[] sizes;
186     immutable(size_t)[] data;
187 }
188 
189 `;
190 
191 auto fullCaseEntry(dstring value, ubyte num, ubyte batch_size)
192 {
193     dchar[3] val = 0;
194     val[0 .. value.length] = value[];
195     return FullCaseEntry(val, num, batch_size, cast(ubyte) value.length);
196 }
197 
198 enum {
199     caseFoldingSrc = "CaseFolding.txt",
200     blocksSrc = "Blocks.txt",
201     propListSrc = "PropList.txt",
202     graphemeSrc = "auxiliary/GraphemeBreakProperty.txt",
203     emojiDataSrc = "emoji/emoji-data.txt",
204     propertyValueAliases = "PropertyValueAliases.txt",
205     corePropSrc = "DerivedCoreProperties.txt",
206     normalizationPropSrc = "DerivedNormalizationProps.txt",
207     scriptsSrc = "Scripts.txt",
208     hangulSyllableSrc = "HangulSyllableType.txt",
209     unicodeDataSrc = "UnicodeData.txt",
210     compositionExclusionsSrc = "CompositionExclusions.txt",
211     specialCasingSrc = "SpecialCasing.txt"
212 }
213 
214 void ensureFilesAreDownloaded()
215 {
216     import std.file : write, mkdirRecurse;
217     import std.net.curl : download;
218     import std.path : dirName, buildPath;
219 
220     string[] files = [
221         caseFoldingSrc,
222         blocksSrc,
223         propListSrc,
224         graphemeSrc,
225         emojiDataSrc,
226         propertyValueAliases,
227         corePropSrc,
228         normalizationPropSrc,
229         scriptsSrc,
230         hangulSyllableSrc,
231         unicodeDataSrc,
232         compositionExclusionsSrc,
233         specialCasingSrc
234     ];
235 
236     mkdirRecurse(unicodeDir);
237     foreach (string file; files)
238     {
239         string dest = unicodeDir.buildPath(file);
240         if (exists(dest))
241             continue;
242         if (file.canFind("/"))
243         {
244             mkdirRecurse(dest.dirName);
245         }
246         writeln("downloading ", unicodeBaseUrl ~ file);
247         download(unicodeBaseUrl ~ file, dest);
248     }
249 }
250 
251 enum HeaderComment = `//Written in the D programming language
252 /**
253  * License: $(HTTP boost.org/LICENSE_1_0.txt, Boost License 1.0).
254  *
255  * Authors: Dmitry Olshansky
256  *
257  */
258 // !!! DO NOT EDIT !!!
259 // !!! Did you even read the comment? !!!
260 // This module is automatically generated from Unicode Character Database files
261 // https://github.com/dlang/phobos/blob/master/tools/unicode_table_generator.d
262 //dfmt off`;
263 
264 auto toPairs(K, V)(V[K] aa)
265 {
266     return aa.values.zip(aa.keys).array;
267 }
268 
269 void main(string[] argv)
270 {
271     string mode = "w";
272 
273     bool minimal = false;
274     getopt(argv, "min", &minimal);
275     if (minimal)
276         mode = "a";
277 
278     ensureFilesAreDownloaded();
279 
280     auto baseSink = File(outputDir.buildPath("unicode_tables.d"), mode);
281     auto compSink = File(outputDir.buildPath("unicode_comp.d"), mode);
282     auto decompSink = File(outputDir.buildPath("unicode_decomp.d"), mode);
283     auto normSink = File(outputDir.buildPath("unicode_norm.d"), mode);
284     auto graphSink = File(outputDir.buildPath("unicode_grapheme.d"), mode);
285     if (!minimal)
286     {
287         baseSink.writeln(HeaderComment);
288         baseSink.writeln("module std.internal.unicode_tables;");
289         baseSink.writeln("\n@safe pure nothrow @nogc package(std):\n");
290 
        compSink.writeln(HeaderComment);
        compSink.writeln("module std.internal.unicode_comp;");
292         compSink.writeln("import std.internal.unicode_tables;");
293         compSink.writeln("\n@safe pure nothrow @nogc package(std):\n");
294 
295         decompSink.writeln(HeaderComment);
296         decompSink.writeln("module std.internal.unicode_decomp;");
297         decompSink.writeln("import std.internal.unicode_tables;");
298         decompSink.writeln("\n@safe pure nothrow @nogc package(std):\n");
299 
300         normSink.writeln(HeaderComment);
301         normSink.writeln("module std.internal.unicode_norm;");
302         normSink.writeln("import std.internal.unicode_tables;");
303         normSink.writeln("\npackage(std):\n");
304 
305         graphSink.writeln(HeaderComment);
306         graphSink.writeln("module std.internal.unicode_grapheme;");
307         graphSink.writeln("import std.internal.unicode_tables;");
308         graphSink.writeln("\npackage(std):\n");
309     }
310 
311     loadBlocks(unicodeDir.buildPath(blocksSrc), blocks);
312     loadProperties(unicodeDir.buildPath(propListSrc), general);
313     loadProperties(unicodeDir.buildPath(corePropSrc), general);
314     loadProperties(unicodeDir.buildPath(scriptsSrc), scripts);
315     loadProperties(unicodeDir.buildPath(hangulSyllableSrc), hangul);
316     loadProperties(unicodeDir.buildPath(graphemeSrc), graphemeBreaks);
317     loadProperties(unicodeDir.buildPath(emojiDataSrc), emojiData);
318     loadPropertyAliases(unicodeDir.buildPath(propertyValueAliases));
319 
320     loadUnicodeData(unicodeDir.buildPath(unicodeDataSrc));
321     loadSpecialCasing(unicodeDir.buildPath(specialCasingSrc));
322     loadExclusions(unicodeDir.buildPath(compositionExclusionsSrc));
323     loadCaseFolding(unicodeDir.buildPath(caseFoldingSrc));
324     loadNormalization(unicodeDir.buildPath(normalizationPropSrc));
325 
326     static void writeTableOfSets(File sink, string prefix, PropertyTable tab)
327     {
328         sink.writeln();
329         writeAliasTable(sink, prefix, tab);
330     }
331 
332     if (!minimal)
333     {
334         writeCaseFolding(baseSink);
335         writeTableOfSets(baseSink, "uniProps", general);
336         writeTableOfSets(baseSink, "blocks", blocks);
337         writeTableOfSets(baseSink, "scripts", scripts);
338         writeTableOfSets(baseSink, "hangul", hangul);
339         writeFunctions(baseSink);
340     }
341 
342     static void trieProlog(File file)
343     {
344         file.writefln("\nstatic if (size_t.sizeof == %d)\n{", size_t.sizeof);
345     }
346 
347     static void trieEpilog(File file)
348     {
        file.writeln("\n}\n");
351     }
352 
353     trieProlog(baseSink);
354     trieProlog(compSink);
355     trieProlog(decompSink);
356     trieProlog(graphSink);
357     trieProlog(normSink);
358 
359     writeTries(baseSink);
360     writeNormalizationTries(normSink);
361     writeGraphemeTries(graphSink);
362     writeCaseCoversion(baseSink);
363     writeCombining(compSink);
364     writeDecomposition(decompSink);
365     writeCompositionTable(compSink);
366 
367     trieEpilog(decompSink);
368     trieEpilog(compSink);
369     trieEpilog(baseSink);
370     trieEpilog(graphSink);
371     trieEpilog(normSink);
372 }
373 
374 void scanUniData(alias Fn)(string name, Regex!char r)
375 {
376     File f = File(name);
377     foreach (line; f.byLine)
378     {
379         auto m = match(line, r);
380         if (!m.empty)
381             Fn(m);
382     }
383 }
384 
385 void loadCaseFolding(string f)
386 {
387     dchar[dchar] simple;
388     dstring[dchar] full;
389 
390     auto r = regex("([^;]*); ([CFST]);\\s*([^;]*);");
391     scanUniData!((m){
392             auto s1 = m.captures[1];
393             auto code = m.captures[2].front;
394             auto s2 = m.captures[3];
395             auto left = parse!int(s1, 16);
396             if (code == 'C')
397             {
398                 auto right = parse!int(s2, 16);
399                 simple[left] = right;
400                 full[left] = [right];
401             }
402             else if (code == 'S')
403             {
404                 auto right = parse!int(s2, 16);
405                 simple[left] = right;
406             }
407             else if (code == 'F')
408             {
409                 dstring right;
410                 foreach (x; match(s2, regex("[0-9A-Fa-f]+", "g")))
411                 {
412                     right ~= to!int(x[0], 16);
413                 }
414                 full[left] = right.idup;
415             }
416             else if (code == 'T')
417             {
                // TODO: handle Turkic mappings; they are ignored for now.
419             }
420     })(f, r);
421 
422     //make some useful sets by hand
423 
424     lowerCaseSet = general.table["Lowercase"];
425     upperCaseSet = general.table["Uppercase"];
426     //titleCaseSet = general.table["Lt"];
427 
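    // Group every character that simple-folds to the same target into one
    // bucket; each member records its position in the bucket (n) and the
    // bucket size.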
428     foreach (ch; simple.keys())
429     {
430         dchar[8] entry;
431         int size=0;
432         entry[size++] = ch;
433         dchar x = simple[ch];
434         entry[size++] = x;
435         //simple is many:1 mapping
436         foreach (key, v; simple)
437         {
438             if (v == x && !canFind(entry[], key))
439             {
440                 entry[size++] = key;
441             }
442         }
443         sort(entry[0 .. size]);
444         foreach (i, value; entry[0 .. size])
445         {
446             simpleTable ~= SimpleCaseEntry(
447                 value,
448                 cast(ubyte) i,
449                 cast(ubyte) size,
450                 cast(bool) (value in lowerCaseSet),
451                 cast(bool) (value in upperCaseSet)
452             );
453         }
454     }
455 
456     foreach (ch; full.keys())
457     {
458         dstring[8] entry;
459         int size=0;
460         entry[size++] = [ch];
461         auto x = full[ch];
462         entry[size++] = x;
463 
464         //full is many:many mapping
465         //sort single-char versions, and let them come first
466         foreach (key, v; full)
467         {
468             if (v == x && !canFind(entry[], [key]))
469             {
470                 entry[size++] = [key];
471             }
472 
473         }
474         auto right = partition!(a => a.length == 1)(entry[0 .. size]);
475         sort(entry[0 .. size - right.length]);
476         foreach (i, value; entry[0 .. size])
477         {
478             fullTable ~= fullCaseEntry(value, cast(ubyte) i, cast(ubyte) size);
479         }
480     }
481 }
482 
483 void loadBlocks(string f, ref PropertyTable target)
484 {
485     auto r = regex(`^([0-9A-F]+)\.\.([0-9A-F]+);\s*(.*)\s*$`);
486     scanUniData!((m){
487             auto s1 = m.captures[1];
488             auto s2 = m.captures[2];
489             auto a1 = parse!uint(s1, 16);
490             auto a2 = parse!uint(s2, 16);
491             //@@@BUG 6178 memory corruption with
492             //target[to!string(m.captures[3])] = CodepointSet(a1, a2+1);
493             auto set = CodepointSet(a1, a2+1);
494             target.table[to!string(m.captures[3])] = set;
495     })(f, r);
496 }
497 
498 void loadProperties(string inp, ref PropertyTable target)
499 {
500     auto acceptProp = (string name) => countUntil(blacklist, name) < 0  && !name.startsWith("Changes");
501     auto r = regex(`^(?:(?:([0-9A-F]+)\.\.([0-9A-F]+)|([0-9A-F]+))\s*;\s*([a-zA-Z_0-9]*)\s*#|# [a-zA-Z_0-9]+=([a-zA-Z_0-9]+))`);
502     string aliasStr;
503     auto set = CodepointSet.init;  //workaround @@@BUG 6178
504     scanUniData!((m){
505         auto name = to!string(m.captures[4]);
506         if (!acceptProp(name))
507             return;
508         if (!m.captures[5].empty)
509             aliasStr = to!string(m.captures[5]);
510         else if (!m.captures[1].empty)
511         {
512             auto sa = m.captures[1];
513             auto sb = m.captures[2];
514             uint a = parse!uint(sa, 16);
515             uint b = parse!uint(sb, 16);
516             if (name !in target.table)
517             {
518                 target.table[name] = set;
519             }
520             auto p = name in target.table;
            p.add(a, b+1); // Unicode lists [a, b]; we need [a, b)
522             if (!aliasStr.empty)
523             {
524                 target.aliases[name] = aliasStr;
525                 aliasStr = "";
526             }
527         }
528         else if (!m.captures[3].empty)
529         {
530             auto sx = m.captures[3];
531             uint x = parse!uint(sx, 16);
532             if (name !in target.table)
533             {
534                 target.table[name] = set;
535             }
536             auto p = name in target.table;
537             *p |= x;
538             if (!aliasStr.empty)
539             {
540                 target.aliases[name] = aliasStr;
541                 aliasStr = "";
542             }
543         }
544     })(inp, r);
545 }
546 
547 void loadPropertyAliases(string inp)
548 {
549     auto r = regex(`^([\w0-9_]+)\s*;\s*([\w0-9_]+)\s*;\s*([\w0-9_]+)`);
550     scanUniData!((m){
551         auto type = m.captures[1];
552         auto target = m.captures[2].idup;
553         auto label = m.captures[3].idup;
554         if (target != label)
555         {
556             if (type == "blk")
557                 blocks.aliases[target] = label;
558             else if (type == "gc")
559                 general.aliases[target] = label;
560             else if (type == "sc")
561                 scripts.aliases[target] = label;
562         }
563     })(inp, r);
564 }
565 
566 void loadNormalization(string inp)
567 {
568     auto r = regex(`^(?:([0-9A-F]+)\.\.([0-9A-F]+)|([0-9A-F]+))\s*;\s*(NFK?[CD]_QC)\s*;\s*([NM])|#\s*[a-zA-Z_0-9]+=([a-zA-Z_0-9]+)`);
569     CodepointSet set; //workaround @@@BUG 6178
570     scanUniData!((m){
571         auto name = to!string(m.captures[4]) ~ to!string(m.captures[5]);
572         if (!m.captures[1].empty)
573         {
574             auto sa = m.captures[1];
575             auto sb = m.captures[2];
576             uint a = parse!uint(sa, 16);
577             uint b = parse!uint(sb, 16);
578             if (name !in normalization)
579             {
580                 normalization[name] = set;
581             }
582             auto p = name in normalization;
583             p.add(a,b+1);
584         }
585         else if (!m.captures[3].empty)
586         {
587             auto sx = m.captures[3];
588             uint x = parse!uint(sx, 16);
589             if (name !in normalization)
590             {
591                 normalization[name] = set;
592             }
593             auto p = name in normalization;
594             *p |= x;
595         }
596     })(inp, r);
597 }
598 
599 void loadUnicodeData(string inp)
600 {
601     auto f = File(inp);
602 
603     dchar characterStart;
604     bool haveRangeStart, haveRangeEnd;
605 
606     CodepointSet all;
607     foreach (line; f.byLine)
608     {
609         auto fields = split(line, ";");
610         //codepoint, name, General_Category, Canonical_Combining_Class, Bidi_Class,
611         //Decomp_Type&Mapping, upper case mapping, lower case mapping, title case mapping
612         auto codepoint = fields[0];
613         auto decomp = fields[5];
614         auto name = fields[1];
615 
616         dchar src = parse!uint(codepoint, 16);
617 
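        // UnicodeData.txt encodes large ranges as a pair of lines whose names
        // end in "First>" and "Last>"; remember the range start so the whole
        // range is added once the "Last>" line is seen.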
618         if (name.endsWith("First>"))
619         {
620             assert(!haveRangeStart);
621             characterStart = src;
622             haveRangeStart = true;
623             continue;
624         }
625         else if (name.endsWith("Last>"))
626         {
627             haveRangeStart = false;
628             haveRangeEnd = true;
629         }
630         else
631         {
632             assert(!haveRangeStart);
633             haveRangeEnd = false;
634             characterStart = src;
635         }
636 
637         auto generalCategory = fields[2];
638         auto ccc = parse!uint(fields[3]);
639         auto upperCasePart = fields[12];
640         auto lowerCasePart = fields[13];
641         auto titleCasePart = fields[14];
642 
643         void appendCaseTab(ref ushort[dchar] index, ref uint[] chars, char[] casePart)
644         {
645             if (!casePart.empty)
646             {
647                 // if you have a range, you shouldn't have any casing provided
648                 assert(!haveRangeEnd);
649 
650                 uint ch = parse!uint(casePart, 16);
651                 chars ~= ch;
652                 assert(chars.length < ushort.max);
653                 index[src] = cast(ushort)(chars.length-1);
654             }
655         }
656 
657         if (generalCategory !in general.table)
658             general.table[generalCategory.idup] = CodepointSet.init;
659 
660         all.add(characterStart, src+1);
661         combiningClass[ccc].add(characterStart, src+1);
662         general.table[generalCategory].add(characterStart, src+1);
663 
664         appendCaseTab(toLowerSimpleIndex, toLowerTab, lowerCasePart);
665         appendCaseTab(toUpperSimpleIndex, toUpperTab, upperCasePart);
666         appendCaseTab(toTitleSimpleIndex, toTitleTab, titleCasePart);
667 
668         if (!decomp.empty)
669         {
670             // none of the ranges in UnicodeData.txt have decompositions provided
671             assert(!haveRangeEnd);
672 
673             //stderr.writeln(codepoint, " ---> ", decomp);
674 
675             dstring dest;
676             bool compat = false;
677             if (decomp.startsWith(" "))
678                 decomp = decomp[1..$];
679             if (decomp.front == '<')
680             {
681                 decomp = findSplitAfter(decomp, ">")[1];
682                 compat = true;
683             }
684             auto vals = split(decomp, " ");
685             foreach (v; vals)
686             {
687                 if (!v.empty)
688                     dest ~= cast(dchar) parse!uint(v, 16);
689             }
690             if (!compat)
691             {
                assert(dest.length <= 2, "canonical decomposition has more than 2 codepoints?!");
693                 canonDecomp[src] = dest;
694             }
695             compatDecomp[src] = dest;
696         }
697     }
698     // compute Cn as all dchar we have not found in UnicodeData.txt
699     general.table["Cn"] = all.inverted;
700     general.aliases["Cn"] = "Unassigned";
701     auto arr = combiningClass[1 .. 255];
702     foreach (i, clazz; arr)//0 is a default for all of 1M+ codepoints
703     {
704         auto y = clazz.byCodepoint;
705         foreach (ch; y)
706             combiningMapping[ch] = cast(ubyte)(i+1);
707     }
708 }
709 
710 void loadSpecialCasing(string f)
711 {
712     {
713         toLowerTabSimpleLen = cast(ushort) toLowerTab.length;
714         toUpperTabSimpleLen = cast(ushort) toUpperTab.length;
715         toTitleTabSimpleLen = cast(ushort) toTitleTab.length;
716 
        // duplicate the simple indexes before adding the unconditional SpecialCasing rules
718         toLowerIndex = toLowerSimpleIndex.dup;
719         toTitleIndex = toTitleSimpleIndex.dup;
720         toUpperIndex = toUpperSimpleIndex.dup;
721     }
722 
723     auto file = File(f);
724     auto r = regex(`([0-9A-F]+(?:\s*[0-9A-F]+)+);`, "g");
725     foreach (line; file.byLine)
726     {
727         if (!line.empty && line[0] == '#')
728         {
729             if (line.canFind("Conditional Mappings"))
730             {
731                 // TODO: we kinda need the conditional mappings for languages like Turkish
732                 break;
733             }
734             else
735                 continue;
736         }
737 
738         auto entries = line.match(r);
739         if (entries.empty)
740             continue;
741         auto pieces = array(entries.map!"a[1]");
742         dchar ch = parse!uint(pieces[0], 16);
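        // Single-char mappings are appended to the table as-is; for multi-char
        // sequences the first entry additionally carries the sequence length in
        // its upper 8 bits (see below).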
743         void processPiece(ref ushort[dchar] index, ref uint[] table, char[] piece)
744         {
745             uint[] mapped = piece.split
746                 .map!(x=>parse!uint(x, 16)).array;
747             if (mapped.length == 1)
748             {
749                 table ~= mapped[0];
750                 index[ch] = cast(ushort)(table.length - 1);
751             }
752             else
753             {
754                 ushort idx = cast(ushort) table.length;
755                 table ~= mapped;
756                 table[idx] |= (mapped.length << 24); //upper 8bits - length of sequence
757                 index[ch] = idx;
758             }
759         }
760 
761         // lower, title, upper
762         processPiece(toLowerIndex, toLowerTab, pieces[1]);
763         processPiece(toTitleIndex, toTitleTab, pieces[2]);
764         processPiece(toUpperIndex, toUpperTab, pieces[3]);
765     }
766 }
767 
768 auto recursivelyDecompose(dstring[dchar] decompTable)
769 {
770     //apply recursively:
771     dstring[dchar] full;
772     foreach (k, v; decompTable)
773     {
774         dstring old, decomp=v;
775         do
776         {
777             old = decomp;
778             decomp = "";
779             foreach (dchar ch; old)
780             {
781                 if (ch in decompTable)
782                     decomp ~= decompTable[ch];
783                 else
784                     decomp ~= ch;
785             }
786         }
787         while (old != decomp);
788         full[k] = decomp;
789     }
790     return full;
791 }
792 
793 void loadExclusions(string inp)
794 {
795     auto r = regex(`^([0-9A-F]+)`);
796     scanUniData!((m){
797         auto piece = m.captures[1];
798         uint a = parse!uint(piece, 16);
799         compExclusions |= cast(dchar) a;
800     })(inp, r);
801 }
802 
803 string charsetString(CodepointSet set, string sep=";\n")
804 {
805     auto app = appender!(char[])();
806     ubyte[] data = compressIntervals(set.byInterval);
807     assert(CodepointSet(decompressIntervals(data)) == set);
808     formattedWrite(app, "x\"%(%02X%)\";", data);
809     return cast(string) app.data;
810 }
811 
812 string identName(string s)
813 {
814     auto app = appender!(char[])();
815     foreach (c; s)
816     {
817         if (c == '-' || c == ' ')
818             app.put('_');
819         else
820             app.put(c);
821     }
822     return cast(string) app.data;
823 }
824 
825 string uniformName(string s)
826 {
827     auto app = appender!(char[])();
828     foreach (c; s)
829     {
830         if (c != '-' && c != ' ' && c != '_')
831             app.put(std.ascii.toLower(c));
832     }
833     return cast(string) app.data;
834 }
835 
836 void writeSets(File sink, PropertyTable src)
837 {
838     with(sink)
839     {
840         writeln("private alias _T = ubyte[];");
841         foreach (k, v; src.table)
842         {
843             writef("_T %s = ", identName(k));
844             writeln(charsetString(v));
845         }
846     }
847 }
848 
849 void writeAliasTable(File sink, string prefix, PropertyTable src)
850 {
851     with(sink)
852     {
853         writeln("struct ", prefix);
854         writeln("{");
855         writeln("private alias _U = immutable(UnicodeProperty);");
856         writeln("@property static _U[] tab() pure { return _tab; }");
857         writeln("static immutable:");
858         writeSets(sink, src);
859         writeln("_U[] _tab = [");
860     }
861     string[] lines;
862     string[] namesOnly;
863     auto app = appender!(char[])();
864     auto keys = src.table.keys;
865     foreach (k; keys)
866     {
867         formattedWrite(app, "_U(\"%s\", %s),\n", k, identName(k));
868         lines ~= app.data.idup;
869         namesOnly ~= uniformName(k);
870         app.shrinkTo(0);
871         if (k in src.aliases)
872         {
873             formattedWrite(app, "_U(\"%s\", %s),\n", src.aliases[k], identName(k));
874             lines ~= app.data.idup;
875             namesOnly ~= uniformName(src.aliases[k]);
876             app.shrinkTo(0);
877         }
878     }
879     static bool ucmp(T)(T a, T b) { return propertyNameLess(a[0], b[0]); }
880     sort!ucmp(zip(namesOnly, lines));
881 
882     with(sink)
883     {
884         foreach (i, v; lines)
885         {
886             write(lines[i]);
887         }
888         writeln("];");
889         writeln("}");
890     }
891 }
892 
893 void writeCaseFolding(File sink)
894 {
895     with(sink)
896     {
897         write(mixedCCEntry);
898 
899         writeln("SCE simpleCaseTable(size_t i)");
900         writeln("{");
901         writef("static immutable uint[] t = x\"");
902         foreach (i, v; simpleTable)
903         {
904             if (i % 12 == 0) writeln();
905             writef("%08X", SCE(v.ch, v.n, v.size).x);
906         }
907 
908         // Inspect max integer size, so efficient bit packing can be found:
909         stderr.writefln("max n: %X", simpleTable.maxElement!(x => x.n).n); // n: 2-bit
910         stderr.writefln("max ch: %X", simpleTable.maxElement!(x => x.ch).ch); // ch: 17-bit
911         stderr.writefln("max size: %X", simpleTable.maxElement!(x => x.size).size); // size: 3-bit
912 
913         writeln("\";");
914         writeln("return SCE(t[i]);");
915         writeln("}");
916         writeln("@property FCE fullCaseTable(size_t index) nothrow @nogc @safe pure");
917         writeln("{");
918         writef("static immutable ulong[] t = x\"");
919         int[4] maxS = 0;
920         foreach (i, v; fullTable)
921         {
922                 foreach (j; 0 .. v.entry_len)
923                     maxS[j] = max(maxS[j], v.value[j]);
924 
925                 if (v.entry_len > 1)
926                 {
927                     assert(v.n >= 1); // meaning that start of bucket is always single char
928                 }
929                 if (i % 6 == 0) writeln();
930                 writef("%016X", FCE(v.seq, v.n, v.size, v.entry_len).x);
931         }
932         writeln("\";");
933         writeln("return FCE(t[index]);");
934         writeln("}");
935         import core.bitop : bsr;
936         stderr.writefln("max seq bits: [%d, %d, %d]", 1 + bsr(maxS[0]), 1 + bsr(maxS[1]), 1 + bsr(maxS[2])); //[17, 11, 10]
937         stderr.writefln("max n = %d", fullTable.map!(x => x.n).maxElement); // 3
938         stderr.writefln("max size = %d", fullTable.map!(x => x.size).maxElement); // 4
939         stderr.writefln("max entry_len = %d", fullTable.map!(x => x.entry_len).maxElement); // 3
940     }
941 }
942 
943 void writeTries(File sink)
944 {
945     ushort[dchar] simpleIndices;
946     foreach (i, v; array(map!(x => x.ch)(simpleTable)))
947         simpleIndices[v] = cast(ushort) i;
948 
949     ushort[dchar] fullIndices;
950     foreach (i, v; fullTable)
951     {
952         if (v.entry_len == 1)
953             fullIndices[v.seq[0]] = cast(ushort) i;
954     }
955 
956     //these 2 only for verification of Trie code itself
957     auto st = codepointTrie!(ushort, 12, 9)(
958         zip(simpleIndices.values, simpleIndices.keys).array, ushort.max);
959     auto ft = codepointTrie!(ushort, 12, 9)(
960         zip(fullIndices.values, fullIndices.keys).array, ushort.max);
961 
962     foreach (k, v; simpleIndices)
963     {
964         assert(st[k] == simpleIndices[k]);
965     }
966 
967     foreach (k, v; fullIndices)
968     {
969         assert(ft[k] == fullIndices[k]);
970     }
971 
972     writeBest3Level(sink, "lowerCase", lowerCaseSet);
973     writeBest3Level(sink, "upperCase", upperCaseSet);
974     //writeBest3Level("titleCase", titleCaseSet);
975     writeBest3Level(sink, "simpleCase", simpleIndices, ushort.max);
976     writeBest3Level(sink, "fullCase", fullIndices, ushort.max);
977 
978     //common isXXX properties
979     auto props = general.table;
980     CodepointSet alpha = props["Alphabetic"]; //it includes some numbers, symbols & marks
981     CodepointSet mark = props["Mn"] | props["Me"] | props["Mc"];
982     CodepointSet number = props["Nd"] | props["Nl"] | props["No"];
983     CodepointSet punctuation = props["Pd"] | props["Ps"] | props["Pe"]
984          | props["Pc"] | props["Po"] | props["Pi"] | props["Pf"];
985     CodepointSet symbol = props["Sm"] | props["Sc"] | props["Sk"] | props["So"];
986     CodepointSet graphical = alpha | mark | number | punctuation | symbol | props["Zs"];
987     CodepointSet nonCharacter = props["Cn"];
988 
989 
990     writeBest3Level(sink, "alpha", alpha);
991     writeBest3Level(sink, "mark", mark);
992     writeBest3Level(sink, "number", number);
993     writeBest3Level(sink, "punctuation", punctuation);
994     writeBest3Level(sink, "symbol", symbol);
995     writeBest3Level(sink, "graphical", graphical);
996     writeBest4Level(sink, "nonCharacter", nonCharacter);
997 
998 }
999 
1000 void writeNormalizationTries(File sink)
1001 {
1002     CodepointSet nfcQC = normalization["NFC_QCN"] | normalization["NFC_QCM"];
1003     CodepointSet nfdQC = normalization["NFD_QCN"];
1004     CodepointSet nfkcQC = normalization["NFKC_QCN"] | normalization["NFKC_QCM"];
1005     CodepointSet nfkdQC = normalization["NFKD_QCN"];
1006     writeBest3Level(sink, "nfcQC", nfcQC);
1007     writeBest3Level(sink, "nfdQC", nfdQC);
1008     writeBest3Level(sink, "nfkcQC", nfkcQC);
1009     writeBest3Level(sink, "nfkdQC", nfkdQC);
1010 }
1011 
1012 void writeGraphemeTries(File sink)
1013 {
    //a few specifics for the grapheme cluster breaking algorithm
1016     auto props = general.table;
1017     writeBest3Level(sink, "hangulLV", hangul.table["LV"]);
1018     writeBest3Level(sink, "hangulLVT", hangul.table["LVT"]);
1019 
1020     // Grapheme specific information
1021     writeBest3Level(sink, "prepend", graphemeBreaks.table["Prepend"]);
1022     writeBest3Level(sink, "control", graphemeBreaks.table["Control"]);
1023 
1024     // We use Grapheme_Cluster_Break=SpacingMark instead of GC=Mc,
1025     //  Grapheme_Cluster_Break=SpacingMark is derived from GC=Mc and includes other general category values
1026     writeBest3Level(sink, "spacingMark", graphemeBreaks.table["SpacingMark"]);
1027 
1028     // We use the Grapheme_Cluster_Break=Extend instead of Grapheme_Extend,
1029     //  Grapheme_Cluster_Break=Extend is derived from Grapheme_Extend and is more complete
1030     writeBest3Level(sink, "graphemeExtend", graphemeBreaks.table["Extend"]);
1031 
1032     // emoji related data
1033     writeBest3Level(sink, "Extended_Pictographic", emojiData.table["Extended_Pictographic"]);
1034 }
1035 
1036 /// Write a function that returns a dchar[] with data stored in `table`
1037 void writeDstringTable(File sink, string name, const dchar[] table)
1038 {
1039     sink.writefln("dstring %s() nothrow @nogc pure @safe {\nstatic immutable dchar[%d] t =", name, table.length);
1040     sink.writeDstring(table);
1041     sink.writeln(";\nreturn t[];\n}");
1042 }
1043 
1044 /// Write a function that returns a uint[] with data stored in `table`
1045 void writeUintTable(File sink, string name, const uint[] table)
1046 {
    sink.writefln("immutable(uint)[] %s() nothrow @nogc pure @safe {\nstatic immutable uint[] t =", name);
1048     sink.writeUintArray(table);
1049     sink.writeln(";\nreturn t;\n}");
1050 }
1051 
1052 void writeCaseCoversion(File sink)
1053 {
1054     {
1055         sink.writefln("enum MAX_SIMPLE_LOWER = %d;", toLowerTabSimpleLen);
1056         sink.writefln("enum MAX_SIMPLE_UPPER = %d;", toUpperTabSimpleLen);
1057         sink.writefln("enum MAX_SIMPLE_TITLE = %d;", toTitleTabSimpleLen);
1058     }
1059 
1060     {
1061         // these are case mappings that also utilize the unconditional SpecialCasing.txt rules
1062         writeBest3Level(sink, "toUpperIndex", toUpperIndex, ushort.max);
1063         writeBest3Level(sink, "toLowerIndex", toLowerIndex, ushort.max);
1064         writeBest3Level(sink, "toTitleIndex", toTitleIndex, ushort.max);
1065     }
1066 
1067     {
1068         // these are all case mapping tables that are 1:1 acquired from UnicodeData.txt
1069         writeBest3Level(sink, "toUpperSimpleIndex", toUpperSimpleIndex, ushort.max);
1070         writeBest3Level(sink, "toLowerSimpleIndex", toLowerSimpleIndex, ushort.max);
1071         writeBest3Level(sink, "toTitleSimpleIndex", toTitleSimpleIndex, ushort.max);
1072     }
1073 
1074     writeUintTable(sink, "toUpperTable", toUpperTab);
1075     writeUintTable(sink, "toLowerTable", toLowerTab);
1076     writeUintTable(sink, "toTitleTable", toTitleTab);
1077 }
1078 
1079 void writeDecomposition(File sink)
1080 {
1081     auto fullCanon = recursivelyDecompose(canonDecomp);
1082     auto fullCompat = recursivelyDecompose(compatDecomp);
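    // All distinct decompositions are concatenated into a single NUL-separated
    // dstring; each codepoint is then mapped to the offset of its decomposition
    // within that string.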
1083     dstring decompCanonFlat = "\0"~array(fullCanon.values).sort.uniq.join("\0")~"\0";
1084     dstring decompCompatFlat = "\0"~array(fullCompat.values).sort.uniq.join("\0")~"\0";
1085     stderr.writeln("Canon flattened: ", decompCanonFlat.length);
1086     stderr.writeln("Compat flattened: ", decompCompatFlat.length);
1087 
1088     ushort[dchar] mappingCanon;
1089     ushort[dchar] mappingCompat;
    //offset 0 serves as the "doesn't decompose" value
1091     foreach (k, v; fullCanon)
1092     {
1093         size_t idx = decompCanonFlat.countUntil(v~"\0");
1094         enforce(idx != 0);
1095         enforce(decompCanonFlat[idx .. idx+v.length] == v);
1096         mappingCanon[k] = cast(ushort) idx;
1097     }
1098     foreach (k, v; fullCompat)
1099     {
1100         size_t idx = decompCompatFlat.countUntil(v~"\0");
1101         enforce(idx != 0);
1102         enforce(decompCompatFlat[idx .. idx+v.length] == v);
1103         mappingCompat[k] = cast(ushort) idx;
1104     }
1105     enforce(decompCanonFlat.length < 2^^16);
1106     enforce(decompCompatFlat.length < 2^^16);
1107 
1108     //these 2 are just self-test for Trie template code
1109     auto compatRange = zip(mappingCompat.values, mappingCompat.keys).array;
1110     auto canonRange = zip(mappingCanon.values, mappingCanon.keys).array;
1111     auto compatTrie = codepointTrie!(ushort, 12, 9)(compatRange, 0);
1112     auto canonTrie =  codepointTrie!(ushort, 12, 9)(canonRange, 0);
1113     import std.string;
1114     foreach (k, v; fullCompat)
1115     {
1116         auto idx = compatTrie[k];
1117         enforce(idx == mappingCompat[k], "failed on compat");
1118         size_t len = decompCompatFlat[idx..$].countUntil(0);
1119         enforce(decompCompatFlat[idx .. idx+len] == v,
1120             format("failed on compat: '%( 0x0%5x %)' not found", v));
1121     }
1122     foreach (k, v; fullCanon)
1123     {
1124         auto idx = canonTrie[k];
1125         enforce(idx == mappingCanon[k], "failed on canon");
1126         size_t len = decompCanonFlat[idx..$].countUntil(0);
1127         enforce(decompCanonFlat[idx .. idx+len] == v,
1128             format("failed on canon: '%( 0x%5x %)' not found", v));
1129     }
1130 
1131     writeBest3Level(sink, "compatMapping", mappingCompat, cast(ushort) 0);
1132     writeBest3Level(sink, "canonMapping", mappingCanon, cast(ushort) 0);
1133 
1134     writeDstringTable(sink, "decompCanonTable", decompCanonFlat);
1135     writeDstringTable(sink, "decompCompatTable", decompCompatFlat);
1136 }
1137 
1138 void writeFunctions(File sink)
1139 {
1140     auto format = general.table["Cf"];
1141     auto space = general.table["Zs"];
1142     auto control = general.table["Cc"];
1143     auto whitespace = general.table["White_Space"];
1144 
1145     //hangul L, V, T
1146     auto hangL = hangul.table["L"];
1147     auto hangV = hangul.table["V"];
1148     auto hangT = hangul.table["T"];
1149     with(sink)
1150     {
1151         writeln(format.toSourceCode("isFormatGen"));
1152         writeln(control.toSourceCode("isControlGen"));
1153         writeln(space.toSourceCode("isSpaceGen"));
1154         writeln(whitespace.toSourceCode("isWhiteGen"));
1155         writeln(hangL.toSourceCode("isHangL"));
1156         writeln(hangV.toSourceCode("isHangV"));
1157         writeln(hangT.toSourceCode("isHangT"));
1158     }
1159 }
1160 
1161 /// Write a `dchar[]` as hex string
1162 void writeUintArray(T:dchar)(File sink, const T[] tab)
1163 {
1164     size_t lineCount = 1;
1165     sink.write("x\"");
1166     foreach (i, elem; tab)
1167     {
1168         if ((i % 12) == 0)
1169             sink.write("\n");
1170 
1171         sink.writef("%08X", elem);
1172     }
1173     sink.write("\"");
1174 }
1175 
1176 /// Write a `dchar[]` as a dstring ""d
1177 void writeDstring(T:dchar)(File sink, const T[] tab)
1178 {
1179     size_t lineCount = 1;
1180     sink.write("\"");
1181     foreach (elem; tab)
1182     {
1183         if (lineCount >= 110)
1184         {
1185             sink.write("\"d~\n\"");
1186             lineCount = 1;
1187         }
1188         if (elem >= 0x10FFFF)
1189         {
1190             // invalid dchar, but might have extra info bit-packed in upper bits
1191             sink.writef("\"d~cast(dchar) 0x%08X~\"", elem);
1192             lineCount += 25;
1193         }
1194         else if (elem <= 0xFFFF)
1195         {
1196             sink.writef("\\u%04X", elem);
1197             lineCount += 6;
1198         }
1199         else
1200         {
1201             sink.writef("\\U%08X", elem);
1202             lineCount += 10;
1203         }
1204     }
1205     sink.write("\"d");
1206 }
1207 
1208 
1209 void writeCompositionTable(File sink)
1210 {
1211     dchar[dstring] composeTab;
1212     //construct compositions table
1213     foreach (dchar k, dstring v; canonDecomp)
1214     {
1215         if (v.length != 2)//singleton
1216             continue;
1217         if (v[0] in combiningMapping) //non-starter
1218             continue;
1219         if (k in combiningMapping) //combines to non-starter
1220             continue;
1221         if (compExclusions[k]) // non-derivable exclusions
1222             continue;
1223         composeTab[v] = k;
1224     }
1225 
1226     Tuple!(dchar, dchar, dchar)[] triples;
1227     foreach (dstring key, dchar val; composeTab)
1228         triples ~= Tuple!(dchar, dchar, dchar)(key[0], key[1], val);
1229     multiSort!("a[0] < b[0]", "a[1] < b[1]")(triples);
1230     //map to the triplets array
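    // Each starter character maps to (cnt << 11) | idx, where idx is the offset
    // of its run inside `triples` and cnt is the number of compositions starting
    // with it (cf. composeIdxMask/composeCntShift emitted below).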
1231     ushort[dchar] trimap;
1232     dchar old = triples[0][0];
1233     auto r = triples[];
1234     for (size_t idx = 0;;)
1235     {
1236         ptrdiff_t cnt = countUntil!(x => x[0] != old)(r);
1237         if (cnt == -1)//end of input
1238             cnt = r.length;
1239         assert(idx < 2048);
1240         assert(cnt < 32);
1241         trimap[old] = to!ushort(idx | (cnt << 11));
1242         idx += cnt;
1243         if (idx == triples.length)
1244             break;
1245         old = r[cnt][0];
1246         r = r[cnt..$];
1247     }
1248 
1249     auto triT = codepointTrie!(ushort, 12, 9)(trimap.toPairs, ushort.max);
1250     auto dupletes = triples.map!(x => tuple(x[1], x[2])).array;
1251     foreach (dstring key, dchar val; composeTab)
1252     {
1253         size_t pack = triT[key[0]];
1254         assert(pack != ushort.max);
1255         size_t idx = pack & ((1 << 11) - 1), cnt = pack >> 11;
1256         auto f = dupletes[idx .. idx+cnt].find!(x => x[0] == key[1]);
1257         assert(!f.empty);
1258         // & starts with the right value
1259         assert(f.front[1] == val);
1260     }
1261     with(sink)
1262     {
1263         writeln("enum composeIdxMask = (1 << 11) - 1, composeCntShift = 11;");
1264         write("enum compositionJumpTrieEntries = TrieEntry!(ushort, 12, 9)(");
1265         triT.storeTrie(sink.lockingTextWriter());
1266         writeln(");");
1267         writeDstringTable(sink, "compositionTable", dupletes.map!(x => only(x.expand)).joiner.array);
1268     }
1269 }
1270 
1271 void writeCombining(File sink)
1272 {
1273     auto ct = codepointTrie!(ubyte, 7, 5, 9)(combiningMapping.toPairs);
1274     foreach (i, clazz; combiningClass[1 .. 255])//0 is a default for all of 1M+ codepoints
1275     {
1276         foreach (ch; clazz.byCodepoint)
1277             assert(ct[ch] == i+1);
1278     }
1279     writeBest3Level(sink, "combiningClass", combiningMapping);
1280 }
1281 
1282 //fussy compare for unicode property names as per UTS-18
1283 int comparePropertyName(Char)(const(Char)[] a, const(Char)[] b)
1284 {
1285     for (;;)
1286     {
1287         while (!a.empty && (isWhite(a.front) || a.front == '-' || a.front =='_'))
1288         {
1289             a.popFront();
1290         }
1291         while (!b.empty && (isWhite(b.front) || b.front == '-' || b.front =='_'))
1292         {
1293             b.popFront();
1294         }
1295         if (a.empty)
1296             return b.empty ? 0 : -1;
1297         if (b.empty)
1298             return 1;
        // names are all ASCII either way, though whitespace might be Unicode
1300         auto ca = std.ascii.toLower(a.front), cb = std.ascii.toLower(b.front);
1301         if (ca > cb)
1302             return 1;
        else if (ca < cb)
1304             return -1;
1305         a.popFront();
1306         b.popFront();
1307     }
1308 }
1309 
1310 bool propertyNameLess(Char)(const(Char)[] a, const(Char)[] b)
1311 {
1312     return comparePropertyName(a, b) < 0;
1313 }
1314 
1315 //meta helpers to generate and pick the best trie by size & levels
1316 
1317 void writeBest2Level(Set)(File sink, string name, Set set)
1318     if (isCodepointSet!Set)
1319 {
1320     alias List = TypeTuple!(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
1321     size_t min = size_t.max;
1322     void delegate(File) write;
1323     foreach (lvl_1; List)
1324     {
1325         enum lvl_2 = 21-lvl_1;
1326         auto t = codepointSetTrie!(lvl_1, lvl_2)(set);
1327         if (t.bytes < min)
1328         {
1329             min = t.bytes;
1330             write = createPrinter!(lvl_1, lvl_2)(name, t);
1331         }
1332     }
1333     write(sink);
1334 }
1335 
1336 void writeBest2Level(V, K)(File sink, string name, V[K] map, V defValue=V.init)
1337 {
1338     alias List = TypeTuple!(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
1339     size_t min = size_t.max;
1340     void delegate(File) write;
1341     auto range = zip(map.values, map.keys).array;
1342     foreach (lvl_1; List)
1343     {
1344         enum lvl_2 = 21-lvl_1;
1345         alias codepointTrie!(V, lvl_1, lvl_2) CurTrie;
1346         CurTrie t = CurTrie(range, defValue);
1347         if (t.bytes < min)
1348         {
1349             min = t.bytes;
1350             write = createPrinter!(lvl_1, lvl_2)(name, t);
1351         }
1352     }
1353     write(sink);
1354 }
1355 
1356 alias List_1 = TypeTuple!(4, 5, 6, 7, 8);
1357 
1358 auto writeBest3Level(Set)(File sink, string name, Set set)
1359     if (isCodepointSet!Set)
1360 {
1361     // access speed trumps size, power of 2 is faster to access
    // e.g. 9, 5, 7 is far slower than 8, 5, 8 because of how the bits break down:
1363     // 8-5-8: indexes are 21-8 = 13 bits, 13-5 = 8 bits, fits into a byte
1364     // 9-5-7: indexes are 21-7 = 14 bits, 14-5 = 9 bits, doesn't fit into a byte (!)
1365 
    // e.g. 8-5-8 is one of the hand-picked combinations that comes very close
    // to the best packing
1368     void delegate(File) write;
1369 
1370     alias List = TypeTuple!(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1371     size_t min = size_t.max;
1372     foreach (lvl_1; List_1)//to have the first stage index fit in byte
1373     {
1374         foreach (lvl_2; List)
1375         {
1376             static if (lvl_1 + lvl_2  <= 16)//so that 2nd stage fits in ushort
1377             {
1378                 enum lvl_3 = 21-lvl_2-lvl_1;
1379                 auto t = codepointSetTrie!(lvl_1, lvl_2, lvl_3)(set);
1380                 if (t.bytes < min)
1381                 {
1382                     min = t.bytes;
1383                     write = createPrinter!(lvl_1, lvl_2, lvl_3)(name, t);
1384                 }
1385             }
1386         }
1387     }
1388     write(sink);
1389 }
1390 
1391 void writeBest3Level(V, K)(File sink, string name, V[K] map, V defValue=V.init)
1392 {
1393     void delegate(File) write;
1394     alias List = TypeTuple!(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
1395     size_t min = size_t.max;
1396     auto range = zip(map.values, map.keys).array;
1397     foreach (lvl_1; List_1)//to have the first stage index fit in byte
1398     {
1399         foreach (lvl_2; List)
1400         {
1401             static if (lvl_1 + lvl_2  <= 16)// into ushort
1402             {
1403                 enum lvl_3 = 21-lvl_2-lvl_1;
1404                 auto t = codepointTrie!(V, lvl_1, lvl_2, lvl_3) (range, defValue);
1405                 if (t.bytes < min)
1406                 {
1407                     min = t.bytes;
1408                     write = createPrinter!(lvl_1, lvl_2, lvl_3)(name, t);
1409                 }
1410             }
1411         }
1412     }
1413     write(sink);
1414 }
1415 
1416 void writeBest4Level(Set)(File sink, string name, Set set)
1417 {
1418     alias List = TypeTuple!(4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
1419     size_t min = size_t.max;
1420     void delegate(File) write;
1421     foreach (lvl_1; List_1)//to have the first stage index fit in byte
1422     {
1423         foreach (lvl_2; List)
1424         {
1425             foreach (lvl_3; List)
1426             {
1427                 static if (lvl_1 + lvl_2 + lvl_3  <= 16)
1428                 {
1429                     enum lvl_4 = 21-lvl_3-lvl_2-lvl_1;
1430                     auto t = codepointSetTrie!(lvl_1, lvl_2, lvl_3, lvl_4)(set);
1431                     if (t.bytes < min)
1432                     {
1433                         min = t.bytes;
1434                         write = createPrinter!(lvl_1, lvl_2, lvl_3, lvl_4)(name, t);
1435                     }
1436                 }
1437             }
1438         }
1439     }
1440     write(sink);
1441 }
1442 
1443 template createPrinter(Params...)
1444 {
1445     void delegate(File) createPrinter(T)(string name, T trie)
1446     {
1447         return (File sink){
1448             sink.writef("//%d bytes\nenum %sTrieEntries = TrieEntry!(%s",
1449                 trie.bytes, name, Unqual!(typeof(T.init[0])).stringof);
1450             foreach (lvl; Params[0..$])
1451                 sink.writef(", %d", lvl);
1452             sink.write(")(");
1453             trie.storeTrie(sink.lockingTextWriter());
1454             sink.writeln(");");
1455         };
1456     }
1457 }
1458 
1459 void storeTrie(T, O)(T trie, O sink)
1460 {
1461     import std.format.write : formattedWrite;
1462     void store(size_t[] arr)
1463     {
1464         formattedWrite(sink, "x\"");
1465         foreach (i; 0 .. arr.length)
1466         {
1467             static if (size_t.sizeof == 8)
1468             {
1469                 formattedWrite(sink, (i % 6) == 0 ? "\n" : "");
1470                 formattedWrite(sink, "%016X", arr[i]);
1471             }
1472             else
1473             {
1474                 formattedWrite(sink, (i % 12) == 0 ? "\n" : "");
1475                 formattedWrite(sink, "%08X", arr[i]);
1476             }
1477         }
1478         formattedWrite(sink, "\",\n");
1479     }
1480     // Access private members
1481     auto table = __traits(getMember, trie, "_table");
1482     store(__traits(getMember, table, "offsets"));
1483     store(__traits(getMember, table, "sz"));
1484     store(__traits(getMember, table, "storage"));
1485 }