1 //Written in the D programming language
2 /**
3     gen_uni is a tool to automatically generate source code for unicode data structures.
4 
5     To generate the tables use:
6     ```
7     $ rdmd -m32 unicode_table_generator.d
8     $ rdmd -m64 unicode_table_generator.d --min
9     ```
10 
11     See the global ``UnicodeDatabaseDirectory`` for the latest version of the Unicode database that was used to generate the tables.
12 
13     TODO: Support emitting of Turkic casefolding mappings
14 
15     Authors: Dmitry Olshansky
16 
17     License: Boost
18 */
19 module std.unicode_table_generator;
20 // this shouldn't be in std package, but stuff is package'd as that in std.uni.
21 
22 import std.uni, std.stdio, std.traits, std.typetuple,
23      std.exception, std.format, std.algorithm, std.typecons,
24      std.regex, std.range, std.conv, std.getopt;
25 
26 import std.file : exists;
27 static import std.ascii, std.string;
28 
29 //common binary property sets and their aliases
struct PropertyTable
{
    CodepointSet[string] table; // canonical property/block/script name -> codepoint set
    string[string] aliases;     // canonical name -> alternate (alias) name for the same set
}
35 
// property tables loaded from the various UCD files (see main)
PropertyTable general;
PropertyTable blocks;
PropertyTable scripts;
PropertyTable hangul;
PropertyTable graphemeBreaks;
PropertyTable emojiData;

//quick NO/MAYBE character sets
CodepointSet[string] normalization;

//auxiliary sets for case mapping
CodepointSet lowerCaseSet, upperCaseSet;
//CodepointSet titleCaseSet; //no sensible version found for isTitlecase

// sets for toLower/toUpper/toTitle
uint[] toLowerTab;
ushort toLowerTabSimpleLen; //start of long mappings
ushort[dchar] toLowerSimpleIndex, toLowerIndex;
uint[] toUpperTab;
ushort toUpperTabSimpleLen; //ditto for Upper
ushort[dchar] toUpperSimpleIndex, toUpperIndex;
uint[] toTitleTab;
ushort toTitleTabSimpleLen; //ditto for Title
ushort[dchar] toTitleSimpleIndex, toTitleIndex;

// pulls SimpleCaseEntry/FullCaseEntry & friends into this module; the same
// source text is also written into the generated module by writeCaseFolding
mixin(mixedCCEntry);

//case folding mapping
SimpleCaseEntry[] simpleTable;
FullCaseEntry[] fullTable;

///canonical combining class
CodepointSet[256] combiningClass;
//same but packaged per dchar
ubyte[dchar] combiningMapping;

//unrolled decompositions
dstring[dchar] canonDecomp;
dstring[dchar] compatDecomp;

//canonical composition tables
dchar[] canonicalyComposableLeft;
dchar[] canonicalyComposableRight;


//canonical composition exclusions
CodepointSet compExclusions;

//property names to discard
string[] blacklist = [];
86 
/++
    D source text shared between the generator and its output: it is mixed in
    at module scope above so the generator itself can use these types, and it
    is also written verbatim into the generated tables module by
    `writeCaseFolding`. Do not change the string contents without
    regenerating all tables.
+/
enum mixedCCEntry = `
struct SimpleCaseEntry
{
    uint ch;
    ubyte n, bucket;// n - number in bucket

pure nothrow @nogc:

    @property ubyte size() const
    {
        return bucket & 0x3F;
    }
    @property auto isLower() const
    {
        return bucket & 0x40;
    }
    @property auto isUpper() const
    {
        return bucket & 0x80;
    }
}

struct FullCaseEntry
{
    dchar[3] seq;
    ubyte n, size;// n number in batch, size - size of batch
    ubyte entry_len;

    @property auto value() const @trusted pure nothrow @nogc return
    {
        return seq[0 .. entry_len];
    }
}

struct CompEntry
{
    dchar rhs, composed;
}

struct UnicodeProperty
{
    string name;
    ubyte[] compressed;
}

struct TrieEntry(T...)
{
    size_t[] offsets;
    size_t[] sizes;
    size_t[] data;
}

`;
140 
/// Packs a full case-folding mapping (at most 3 codepoints) together with its
/// position `num` inside a bucket of `batch_size` entries.
auto fullCaseEntry(dstring value, ubyte num, ubyte batch_size)
{
    dchar[3] seq;
    foreach (i, dchar c; value)
        seq[i] = c;
    return FullCaseEntry(seq, num, batch_size, cast(ubyte) value.length);
}
147 
// locations of the downloaded Unicode Character Database files;
// bump the directory name when regenerating against a newer Unicode version
enum {
    UnicodeDatabaseDirectory = "ucd-15/",
    caseFoldingSrc = UnicodeDatabaseDirectory ~ "CaseFolding.txt",
    blocksSrc = UnicodeDatabaseDirectory ~ "Blocks.txt",
    propListSrc = UnicodeDatabaseDirectory ~ "PropList.txt",
    graphemeSrc = UnicodeDatabaseDirectory ~ "auxiliary/GraphemeBreakProperty.txt",
    emojiDataSrc = UnicodeDatabaseDirectory ~ "emoji/emoji-data.txt",
    propertyValueAliases = UnicodeDatabaseDirectory ~ "PropertyValueAliases.txt",
    corePropSrc = UnicodeDatabaseDirectory ~ "DerivedCoreProperties.txt",
    normalizationPropSrc = UnicodeDatabaseDirectory ~ "DerivedNormalizationProps.txt",
    scriptsSrc = UnicodeDatabaseDirectory ~ "Scripts.txt",
    hangulSyllableSrc = UnicodeDatabaseDirectory ~ "HangulSyllableType.txt",
    unicodeDataSrc = UnicodeDatabaseDirectory ~ "UnicodeData.txt",
    compositionExclusionsSrc = UnicodeDatabaseDirectory ~ "CompositionExclusions.txt",
    specialCasingSrc = UnicodeDatabaseDirectory ~ "SpecialCasing.txt"
}
164 
// boilerplate prepended to the generated modules; the string contents are
// emitted verbatim, so any change here changes the generated files
enum HeaderComment = `//Written in the D programming language
/**
 * License: $(HTTP boost.org/LICENSE_1_0.txt, Boost License 1.0).
 *
 * Authors: Dmitry Olshansky
 *
 */
// !!! DO NOT EDIT !!!
// !!! Did you even read the comment? !!!
// This module is automatically generated from Unicode Character Database files
// https://github.com/dlang/phobos/blob/master/tools/unicode_table_generator.d
//dfmt off`;
177 
/// Flattens an associative array into an array of (value, key) tuples.
auto toPairs(K, V)(V[K] aa)
{
    return zip(aa.values, aa.keys).array;
}
182 
/++
    Entry point. Loads the UCD files from `UnicodeDatabaseDirectory` and
    (re)generates the `std.internal.unicode_*` modules.

    Run twice as shown in the module header: the plain run truncates the
    output files and writes everything including the tries for the current
    pointer size; the `--min` run appends only the tries for the other
    pointer size.
+/
void main(string[] argv)
{
    string mode = "w";

    bool minimal = false;
    getopt(argv, "min", &minimal);
    if (minimal)
        mode = "a"; // append: keep the non-trie part written by the full run

    if (!exists(UnicodeDatabaseDirectory))
    {
        writeln("Did you forget to download the Unicode database tables?");
        writeln("Looking for them in: ", UnicodeDatabaseDirectory);
        return;
    }

    enum UnicodeTableDirectory = "../std/internal/";

    auto baseSink = File(UnicodeTableDirectory ~ "unicode_tables.d", mode);
    auto compSink = File(UnicodeTableDirectory ~ "unicode_comp.d", mode);
    auto decompSink = File(UnicodeTableDirectory ~ "unicode_decomp.d", mode);
    auto normSink = File(UnicodeTableDirectory ~ "unicode_norm.d", mode);
    auto graphSink = File(UnicodeTableDirectory ~ "unicode_grapheme.d", mode);
    if (!minimal)
    {
        // module prologues are written only on the full (non --min) pass
        baseSink.writeln(HeaderComment);
        baseSink.writeln("module std.internal.unicode_tables;");
        baseSink.writeln("\n@safe pure nothrow @nogc package(std):\n");

        // fix: unicode_comp.d was the only generated module missing the
        // "DO NOT EDIT" header comment
        compSink.writeln(HeaderComment);
        compSink.writeln("module std.internal.unicode_comp;");
        compSink.writeln("import std.internal.unicode_tables;");
        compSink.writeln("\n@safe pure nothrow @nogc package(std):\n");

        decompSink.writeln(HeaderComment);
        decompSink.writeln("module std.internal.unicode_decomp;");
        decompSink.writeln("import std.internal.unicode_tables;");
        decompSink.writeln("\n@safe pure nothrow @nogc package(std):\n");

        normSink.writeln(HeaderComment);
        normSink.writeln("module std.internal.unicode_norm;");
        normSink.writeln("import std.internal.unicode_tables;");
        normSink.writeln("\npackage(std):\n");

        graphSink.writeln(HeaderComment);
        graphSink.writeln("module std.internal.unicode_grapheme;");
        graphSink.writeln("import std.internal.unicode_tables;");
        graphSink.writeln("\npackage(std):\n");
    }

    // phase 1: parse the UCD text files into the module-level tables;
    // loadCaseFolding reads general.table, so the property files come first
    loadBlocks(blocksSrc, blocks);
    loadProperties(propListSrc, general);
    loadProperties(corePropSrc, general);
    loadProperties(scriptsSrc, scripts);
    loadProperties(hangulSyllableSrc, hangul);
    loadProperties(graphemeSrc, graphemeBreaks);
    loadProperties(emojiDataSrc, emojiData);
    loadPropertyAliases(propertyValueAliases);

    loadUnicodeData(unicodeDataSrc);
    loadSpecialCasing(specialCasingSrc);
    loadExclusions(compositionExclusionsSrc);
    loadCaseFolding(caseFoldingSrc);
    loadNormalization(normalizationPropSrc);

    static void writeTableOfSets(File sink, string prefix, PropertyTable tab)
    {
        sink.writeln();
        writeAliasTable(sink, prefix, tab);
    }

    // phase 2: pointer-size independent tables, full pass only
    if (!minimal)
    {
        writeCaseFolding(baseSink);
        writeTableOfSets(baseSink, "uniProps", general);
        writeTableOfSets(baseSink, "blocks", blocks);
        writeTableOfSets(baseSink, "scripts", scripts);
        writeTableOfSets(baseSink, "hangul", hangul);
        writeFunctions(baseSink);
    }

    static void trieProlog(File file)
    {
        // tries differ between 32/64-bit, so guard them on size_t.sizeof
        file.writefln("\nstatic if (size_t.sizeof == %d)\n{", size_t.sizeof);
    }

    static void trieEpilog(File file)
    {
        file.writeln("
}\n");
    }

    // phase 3: the tries, wrapped in the static-if guard, on every pass
    trieProlog(baseSink);
    trieProlog(compSink);
    trieProlog(decompSink);
    trieProlog(graphSink);
    trieProlog(normSink);

    writeTries(baseSink);
    writeNormalizationTries(normSink);
    writeGraphemeTries(graphSink);
    writeCaseCoversion(baseSink);
    writeCombining(compSink);
    writeDecomposition(decompSink);
    writeCompositionTable(compSink);

    trieEpilog(decompSink);
    trieEpilog(compSink);
    trieEpilog(baseSink);
    trieEpilog(graphSink);
    trieEpilog(normSink);
}
295 
/// Opens file `name` and invokes `Fn` with the match object for every line
/// that matches regex `r`; non-matching lines are skipped.
void scanUniData(alias Fn)(string name, Regex!char r)
{
    auto input = File(name);
    foreach (line; input.byLine)
    {
        auto hit = line.match(r);
        if (!hit.empty)
            Fn(hit);
    }
}
306 
/++
    Parses CaseFolding.txt and fills `simpleTable`/`fullTable` with
    equivalence classes ("buckets") of codepoints under simple and full case
    folding. Also snapshots `lowerCaseSet`/`upperCaseSet` from the already
    loaded core properties, so `loadProperties` must run first (see `main`).
+/
void loadCaseFolding(string f)
{
    dchar[dchar] simple;
    dstring[dchar] full;

    auto r = regex("([^;]*); ([CFST]);\\s*([^;]*);");
    scanUniData!((m){
            auto s1 = m.captures[1];
            auto code = m.captures[2].front;
            auto s2 = m.captures[3];
            auto left = parse!int(s1, 16);
            if (code == 'C')
            {
                // common: the mapping participates in both simple and full folding
                auto right = parse!int(s2, 16);
                simple[left] = right;
                full[left] = [right];
            }
            else if (code == 'S')
            {
                // simple-only (1:1) mapping
                auto right = parse!int(s2, 16);
                simple[left] = right;
            }
            else if (code == 'F')
            {
                // full folding may expand to a sequence of codepoints
                dstring right;
                foreach (x; match(s2, regex("[0-9A-Fa-f]+", "g")))
                {
                    right ~= to!int(x[0], 16);
                }
                full[left] = right.idup;
            }
            else if (code == 'T')
            {
                // TODO: ignore Turkic languages for now.
            }
    })(f, r);

    //make some useful sets by hand

    lowerCaseSet = general.table["Lowercase"];
    upperCaseSet = general.table["Uppercase"];
    //titleCaseSet = general.table["Lt"];

    // group codepoints that fold to the same target into a sorted bucket,
    // recording each member's index plus the bucket size and case flags
    foreach (ch; simple.keys())
    {
        dchar[8] entry;
        int size=0;
        entry[size++] = ch;
        dchar x = simple[ch];
        entry[size++] = x;
        //simple is many:1 mapping
        foreach (key, v; simple)
        {
            if (v == x && !canFind(entry[], key))
            {
                entry[size++] = key;
            }
        }
        sort(entry[0 .. size]);
        foreach (i, value; entry[0 .. size])
        {
            // bucket byte layout: low 6 bits = size, 0x40 = lowercase, 0x80 = uppercase
            auto withFlags = cast(ubyte) size | (value in lowerCaseSet ? 0x40 : 0)
                 | (value in upperCaseSet ? 0x80 : 0);
            simpleTable ~= SimpleCaseEntry(value, cast(ubyte) i,
                cast(ubyte) withFlags);
        }
    }

    foreach (ch; full.keys())
    {
        dstring[8] entry;
        int size=0;
        entry[size++] = [ch];
        auto x = full[ch];
        entry[size++] = x;

        //full is many:many mapping
        //sort single-char versions, and let them come first
        foreach (key, v; full)
        {
            if (v == x && !canFind(entry[], [key]))
            {
                entry[size++] = [key];
            }

        }
        auto right = partition!(a => a.length == 1)(entry[0 .. size]);
        sort(entry[0 .. size - right.length]);
        foreach (i, value; entry[0 .. size])
        {
            fullTable ~= fullCaseEntry(value, cast(ubyte) i, cast(ubyte) size);
        }
    }
}
401 
/// Parses Blocks.txt lines of the form "XXXX..YYYY; Block Name" and stores
/// each block's codepoint range in `target.table` under its name.
void loadBlocks(string f, ref PropertyTable target)
{
    auto r = regex(`^([0-9A-F]+)\.\.([0-9A-F]+);\s*(.*)\s*$`);
    scanUniData!((m){
            auto loStr = m.captures[1];
            auto hiStr = m.captures[2];
            immutable lo = parse!uint(loStr, 16);
            immutable hi = parse!uint(hiStr, 16);
            //@@@BUG 6178 memory corruption when the temporary set is
            //assigned straight into the AA, hence the named local
            auto set = CodepointSet(lo, hi + 1); //inclusive -> exclusive bound
            target.table[to!string(m.captures[3])] = set;
    })(f, r);
}
416 
/++
    Parses a generic UCD property file (PropList.txt,
    DerivedCoreProperties.txt, Scripts.txt, ...) into `target`. Handles both
    "a..b ; Name" range lines and "x ; Name" single-codepoint lines, and
    picks up "# Xxx=Yyy" comment lines as the alias name for the property
    whose data lines follow.
+/
void loadProperties(string inp, ref PropertyTable target)
{
    // drop blacklisted property names and provisional "Changes..." ones
    auto acceptProp = (string name) => countUntil(blacklist, name) < 0  && !name.startsWith("Changes");
    auto r = regex(`^(?:(?:([0-9A-F]+)\.\.([0-9A-F]+)|([0-9A-F]+))\s*;\s*([a-zA-Z_0-9]*)\s*#|# [a-zA-Z_0-9]+=([a-zA-Z_0-9]+))`);
    string aliasStr;
    auto set = CodepointSet.init;  //workaround @@@BUG 6178
    scanUniData!((m){
        auto name = to!string(m.captures[4]);
        if (!acceptProp(name))
            return;
        if (!m.captures[5].empty)
            aliasStr = to!string(m.captures[5]); // remember for the next data line
        else if (!m.captures[1].empty)
        {
            // range form: a..b
            auto sa = m.captures[1];
            auto sb = m.captures[2];
            uint a = parse!uint(sa, 16);
            uint b = parse!uint(sb, 16);
            if (name !in target.table)
            {
                target.table[name] = set;
            }
            auto p = name in target.table;
            p.add(a,b+1); // unicode lists [a, b] we need [a,b)
            if (!aliasStr.empty)
            {
                // attach the most recent "# ...=..." alias to this property
                target.aliases[name] = aliasStr;
                aliasStr = "";
            }
        }
        else if (!m.captures[3].empty)
        {
            // single-codepoint form
            auto sx = m.captures[3];
            uint x = parse!uint(sx, 16);
            if (name !in target.table)
            {
                target.table[name] = set;
            }
            auto p = name in target.table;
            *p |= x;
            if (!aliasStr.empty)
            {
                target.aliases[name] = aliasStr;
                aliasStr = "";
            }
        }
    })(inp, r);
}
465 
/// Reads PropertyValueAliases.txt and records short-name -> long-name
/// aliases for blocks (blk), general categories (gc) and scripts (sc).
void loadPropertyAliases(string inp)
{
    auto r = regex(`^([\w0-9_]+)\s*;\s*([\w0-9_]+)\s*;\s*([\w0-9_]+)`);
    scanUniData!((m){
        auto kind = m.captures[1];
        auto shortName = m.captures[2].idup;
        auto longName = m.captures[3].idup;
        if (shortName == longName)
            return; // identical names need no alias entry
        if (kind == "blk")
            blocks.aliases[shortName] = longName;
        else if (kind == "gc")
            general.aliases[shortName] = longName;
        else if (kind == "sc")
            scripts.aliases[shortName] = longName;
    })(inp, r);
}
484 
/++
    Parses the quick-check properties (NFC_QC, NFD_QC, NFKC_QC, NFKD_QC)
    from DerivedNormalizationProps.txt into `normalization`, keyed by the
    property name with its value (N or M) appended, e.g. "NFC_QCM".
+/
void loadNormalization(string inp)
{
    auto r = regex(`^(?:([0-9A-F]+)\.\.([0-9A-F]+)|([0-9A-F]+))\s*;\s*(NFK?[CD]_QC)\s*;\s*([NM])|#\s*[a-zA-Z_0-9]+=([a-zA-Z_0-9]+)`);
    CodepointSet set; //workaround @@@BUG 6178
    scanUniData!((m){
        // e.g. "NFC_QC" ~ "N" -> key "NFC_QCN"
        auto name = to!string(m.captures[4]) ~ to!string(m.captures[5]);
        if (!m.captures[1].empty)
        {
            // range form: a..b
            auto sa = m.captures[1];
            auto sb = m.captures[2];
            uint a = parse!uint(sa, 16);
            uint b = parse!uint(sb, 16);
            if (name !in normalization)
            {
                normalization[name] = set;
            }
            auto p = name in normalization;
            p.add(a,b+1);
        }
        else if (!m.captures[3].empty)
        {
            // single-codepoint form
            auto sx = m.captures[3];
            uint x = parse!uint(sx, 16);
            if (name !in normalization)
            {
                normalization[name] = set;
            }
            auto p = name in normalization;
            *p |= x;
        }
    })(inp, r);
}
517 
/++
    Parses UnicodeData.txt: general categories, canonical combining classes,
    simple 1:1 case mappings and canonical/compatibility decompositions.

    Ranges are encoded in the file as two consecutive lines whose names end
    in "First>" / "Last>"; every codepoint in between shares the properties
    given on the "Last>" line.
+/
void loadUnicodeData(string inp)
{
    auto f = File(inp);

    dchar characterStart;
    bool haveRangeStart, haveRangeEnd;

    CodepointSet all;
    foreach (line; f.byLine)
    {
        auto fields = split(line, ";");
        //codepoint, name, General_Category, Canonical_Combining_Class, Bidi_Class,
        //Decomp_Type&Mapping, upper case mapping, lower case mapping, title case mapping
        auto codepoint = fields[0];
        auto decomp = fields[5];
        auto name = fields[1];

        dchar src = parse!uint(codepoint, 16);

        if (name.endsWith("First>"))
        {
            // opening line of a range: remember the start; the properties
            // come from the matching "...Last>" line
            assert(!haveRangeStart);
            characterStart = src;
            haveRangeStart = true;
            continue;
        }
        else if (name.endsWith("Last>"))
        {
            haveRangeStart = false;
            haveRangeEnd = true;
        }
        else
        {
            // ordinary single-codepoint entry
            assert(!haveRangeStart);
            haveRangeEnd = false;
            characterStart = src;
        }

        auto generalCategory = fields[2];
        auto ccc = parse!uint(fields[3]);
        auto upperCasePart = fields[12];
        auto lowerCasePart = fields[13];
        auto titleCasePart = fields[14];

        // appends one simple 1:1 case mapping for `src` to table + index
        void appendCaseTab(ref ushort[dchar] index, ref uint[] chars, char[] casePart)
        {
            if (!casePart.empty)
            {
                // if you have a range, you shouldn't have any casing provided
                assert(!haveRangeEnd);

                uint ch = parse!uint(casePart, 16);
                chars ~= ch;
                assert(chars.length < ushort.max);
                index[src] = cast(ushort)(chars.length-1);
            }
        }

        if (generalCategory !in general.table)
            general.table[generalCategory.idup] = CodepointSet.init;

        // characterStart == src for single entries; covers the whole
        // range for "First>"/"Last>" pairs
        all.add(characterStart, src+1);
        combiningClass[ccc].add(characterStart, src+1);
        general.table[generalCategory].add(characterStart, src+1);

        appendCaseTab(toLowerSimpleIndex, toLowerTab, lowerCasePart);
        appendCaseTab(toUpperSimpleIndex, toUpperTab, upperCasePart);
        appendCaseTab(toTitleSimpleIndex, toTitleTab, titleCasePart);

        if (!decomp.empty)
        {
            // none of the ranges in UnicodeData.txt have decompositions provided
            assert(!haveRangeEnd);

            //stderr.writeln(codepoint, " ---> ", decomp);

            dstring dest;
            bool compat = false;
            if (decomp.startsWith(" "))
                decomp = decomp[1..$];
            if (decomp.front == '<')
            {
                // a leading "<type>" tag marks a compatibility decomposition
                decomp = findSplitAfter(decomp, ">")[1];
                compat = true;
            }
            auto vals = split(decomp, " ");
            foreach (v; vals)
            {
                if (!v.empty)
                    dest ~= cast(dchar) parse!uint(v, 16);
            }
            if (!compat)
            {
                assert(dest.length <= 2, "cannonical decomposition has more then 2 codepoints?!");
                canonDecomp[src] = dest;
            }
            // canonical decompositions are also valid compatibility ones
            compatDecomp[src] = dest;
        }
    }
    // compute Cn as all dchar we have not found in UnicodeData.txt
    general.table["Cn"] = all.inverted;
    general.aliases["Cn"] = "Unassigned";
    auto arr = combiningClass[1 .. 255];
    foreach (i, clazz; arr)//0 is a default for all of 1M+ codepoints
    {
        auto y = clazz.byCodepoint;
        foreach (ch; y)
            combiningMapping[ch] = cast(ubyte)(i+1);
    }
}
628 
/++
    Parses SpecialCasing.txt and appends the unconditional 1:many case
    mappings to the to{Lower,Upper,Title} tables; conditional
    (language-dependent) mappings are skipped for now.

    Must run after `loadUnicodeData` (see `main`): it first records the
    lengths of the simple tables and duplicates the simple indexes.
+/
void loadSpecialCasing(string f)
{
    {
        // remember where the simple (1:1) part of each table ends
        toLowerTabSimpleLen = cast(ushort) toLowerTab.length;
        toUpperTabSimpleLen = cast(ushort) toUpperTab.length;
        toTitleTabSimpleLen = cast(ushort) toTitleTab.length;

        // duplicate the simple indexes prior to adding our uncondtional rules also
        toLowerIndex = toLowerSimpleIndex.dup;
        toTitleIndex = toTitleSimpleIndex.dup;
        toUpperIndex = toUpperSimpleIndex.dup;
    }

    auto file = File(f);
    auto r = regex(`([0-9A-F]+(?:\s*[0-9A-F]+)+);`, "g");
    foreach (line; file.byLine)
    {
        if (!line.empty && line[0] == '#')
        {
            if (line.canFind("Conditional Mappings"))
            {
                // TODO: we kinda need the conditional mappings for languages like Turkish
                break;
            }
            else
                continue;
        }

        auto entries = line.match(r);
        if (entries.empty)
            continue;
        // pieces: codepoint; lower; title; upper (in file order)
        auto pieces = array(entries.map!"a[1]");
        dchar ch = parse!uint(pieces[0], 16);
        // appends one mapping for `ch`; multi-char mappings store their
        // length in the upper 8 bits of the first table element
        void processPiece(ref ushort[dchar] index, ref uint[] table, char[] piece)
        {
            uint[] mapped = piece.split
                .map!(x=>parse!uint(x, 16)).array;
            if (mapped.length == 1)
            {
                table ~= mapped[0];
                index[ch] = cast(ushort)(table.length - 1);
            }
            else
            {
                ushort idx = cast(ushort) table.length;
                table ~= mapped;
                table[idx] |= (mapped.length << 24); //upper 8bits - length of sequence
                index[ch] = idx;
            }
        }

        // lower, title, upper
        processPiece(toLowerIndex, toLowerTab, pieces[1]);
        processPiece(toTitleIndex, toTitleTab, pieces[2]);
        processPiece(toUpperIndex, toUpperTab, pieces[3]);
    }
}
686 
/// Expands every mapping in `decompTable` to its fixpoint: each codepoint is
/// repeatedly substituted by its own decomposition until nothing in the
/// result is decomposable any further.
auto recursivelyDecompose(dstring[dchar] decompTable)
{
    dstring[dchar] expanded;
    foreach (ch, mapping; decompTable)
    {
        dstring cur = mapping;
        for (;;)
        {
            dstring next;
            foreach (dchar c; cur)
            {
                if (auto p = c in decompTable)
                    next ~= *p;
                else
                    next ~= c;
            }
            if (next == cur)
                break; // fixpoint reached
            cur = next;
        }
        expanded[ch] = cur;
    }
    return expanded;
}
711 
/// Reads CompositionExclusions.txt and collects the excluded codepoints
/// into the global `compExclusions` set.
void loadExclusions(string inp)
{
    auto r = regex(`^([0-9A-F]+)`);
    scanUniData!((m){
        auto hex = m.captures[1];
        immutable cp = parse!uint(hex, 16);
        compExclusions |= cast(dchar) cp;
    })(inp, r);
}
721 
/++
    Serializes `set` as a D array literal of its compressed intervals,
    round-trip checked against `decompressIntervals`.
    NOTE(review): the `sep` parameter is currently unused — the output always
    ends with "];" regardless of it; confirm before removing it.
+/
string charsetString(CodepointSet set, string sep=";\n")
{
    auto app = appender!(char[])();
    ubyte[] data = compressIntervals(set.byInterval);
    assert(CodepointSet(decompressIntervals(data)) == set);
    formattedWrite(app, "[%(0x%x, %)];", data);
    return cast(string) app.data;
}
730 
/// Turns a Unicode property name into a valid D identifier by replacing
/// dashes and spaces with underscores.
string identName(string s)
{
    auto buf = appender!(char[])();
    foreach (ch; s)
        buf.put(ch == '-' || ch == ' ' ? '_' : ch);
    return cast(string) buf.data;
}
743 
/// Normalizes a property name for loose matching: strips '-', ' ' and '_'
/// and lowercases the rest (ASCII only).
string uniformName(string s)
{
    auto buf = appender!(char[])();
    foreach (ch; s)
    {
        if (ch == '-' || ch == ' ' || ch == '_')
            continue;
        buf.put(std.ascii.toLower(ch));
    }
    return cast(string) buf.data;
}
754 
/// Writes every set of `src` as a compressed `_T` array named after its
/// (identifier-sanitized) property name.
void writeSets(File sink, PropertyTable src)
{
    sink.writeln("private alias _T = ubyte[];");
    foreach (name, set; src.table)
    {
        sink.writef("_T %s = ", identName(name));
        sink.writeln(charsetString(set));
    }
}
767 
/++
    Emits a struct named `prefix` wrapping all sets of `src` plus a `_tab`
    array of (name, data) entries. Alias names get their own entry pointing
    at the same data, and all entries are sorted with `propertyNameLess` so
    the table can be searched by normalized name.
+/
void writeAliasTable(File sink, string prefix, PropertyTable src)
{
    with(sink)
    {
        writeln("struct ", prefix);
        writeln("{");
        writeln("private alias _U = immutable(UnicodeProperty);");
        writeln("@property static _U[] tab() pure { return _tab; }");
        writeln("static immutable:");
        writeSets(sink, src);
        writeln("_U[] _tab = [");
    }
    // build the _tab entries as text lines first, then sort them
    string[] lines;
    string[] namesOnly;
    auto app = appender!(char[])();
    auto keys = src.table.keys;
    foreach (k; keys)
    {
        formattedWrite(app, "_U(\"%s\", %s),\n", k, identName(k));
        lines ~= app.data.idup;
        namesOnly ~= uniformName(k);
        app.shrinkTo(0);
        if (k in src.aliases)
        {
            // the alias entry references the same compressed data array
            formattedWrite(app, "_U(\"%s\", %s),\n", src.aliases[k], identName(k));
            lines ~= app.data.idup;
            namesOnly ~= uniformName(src.aliases[k]);
            app.shrinkTo(0);
        }
    }
    // sort lines by their normalized names (kept in lockstep via zip)
    static bool ucmp(T)(T a, T b) { return propertyNameLess(a[0], b[0]); }
    sort!ucmp(zip(namesOnly, lines));

    with(sink)
    {
        foreach (i, v; lines)
        {
            write(lines[i]);
        }
        writeln("];");
        writeln("}");
    }
}
811 
/++
    Emits the case-folding buckets built by `loadCaseFolding` as
    `simpleCaseTable`/`fullCaseTable`, preceded by the shared struct
    definitions from `mixedCCEntry`. Reports the longest full-folding
    sequence to stderr.
+/
void writeCaseFolding(File sink)
{
    with(sink)
    {
        // the generated module needs the same struct definitions we mixed in
        write(mixedCCEntry);

        writeln("@property immutable(SimpleCaseEntry[]) simpleCaseTable()");
        writeln("{");
        writeln("alias SCE = SimpleCaseEntry;");
        writeln("static immutable SCE[] t = [");
        foreach (i, v; simpleTable)
        {
            writef("SCE(0x%04x, %s, 0x%0x),", v.ch,  v.n, v.bucket);
            if (i % 4 == 0) writeln(); // keep generated lines short
        }
        writeln("];");
        writeln("return t;");
        writeln("}");
        // function-local static: accumulated below, reported after the loop
        static uint maxLen = 0;
        writeln("@property immutable(FullCaseEntry[]) fullCaseTable()  nothrow @nogc @safe pure");
        writeln("{");
        writeln("alias FCE = FullCaseEntry;");
        writeln("static immutable FCE[] t = [");
        foreach (i, v; fullTable)
        {
                maxLen = max(maxLen, v.entry_len);
                if (v.entry_len > 1)
                {
                    assert(v.n >= 1); // meaning that start of bucket is always single char
                }
                writef("FCE(\"%s\", %s, %s, %s),", v.value, v.n, v.size, v.entry_len);
                if (i % 4 == 0) writeln();
        }
        writeln("];");
        writeln("return t;");
        writeln("}");
        stderr.writefln("MAX FCF len = %d", maxLen);
    }
}
851 
/++
    Emits the case-folding index tries plus the common isXXX property tries
    (alpha, mark, number, punctuation, symbol, graphical, nonCharacter).
    Also builds two throwaway tries purely to sanity-check the Trie code.
+/
void writeTries(File sink)
{
    // codepoint -> its entry's position in simpleTable
    ushort[dchar] simpleIndices;
    foreach (i, v; array(map!(x => x.ch)(simpleTable)))
        simpleIndices[v] = cast(ushort) i;

    // codepoint -> position in fullTable (single-char entries only)
    ushort[dchar] fullIndices;
    foreach (i, v; fullTable)
    {
        if (v.entry_len == 1)
            fullIndices[v.seq[0]] = cast(ushort) i;
    }

    //these 2 only for verification of Trie code itself
    auto st = codepointTrie!(ushort, 12, 9)(
        zip(simpleIndices.values, simpleIndices.keys).array, ushort.max);
    auto ft = codepointTrie!(ushort, 12, 9)(
        zip(fullIndices.values, fullIndices.keys).array, ushort.max);

    foreach (k, v; simpleIndices)
    {
        assert(st[k] == simpleIndices[k]);
    }

    foreach (k, v; fullIndices)
    {
        assert(ft[k] == fullIndices[k]);
    }

    writeBest3Level(sink, "lowerCase", lowerCaseSet);
    writeBest3Level(sink, "upperCase", upperCaseSet);
    //writeBest3Level("titleCase", titleCaseSet);
    writeBest3Level(sink, "simpleCase", simpleIndices, ushort.max);
    writeBest3Level(sink, "fullCase", fullIndices, ushort.max);

    //common isXXX properties
    auto props = general.table;
    CodepointSet alpha = props["Alphabetic"]; //it includes some numbers, symbols & marks
    CodepointSet mark = props["Mn"] | props["Me"] | props["Mc"];
    CodepointSet number = props["Nd"] | props["Nl"] | props["No"];
    CodepointSet punctuation = props["Pd"] | props["Ps"] | props["Pe"]
         | props["Pc"] | props["Po"] | props["Pi"] | props["Pf"];
    CodepointSet symbol = props["Sm"] | props["Sc"] | props["Sk"] | props["So"];
    CodepointSet graphical = alpha | mark | number | punctuation | symbol | props["Zs"];
    CodepointSet nonCharacter = props["Cn"];


    writeBest3Level(sink, "alpha", alpha);
    writeBest3Level(sink, "mark", mark);
    writeBest3Level(sink, "number", number);
    writeBest3Level(sink, "punctuation", punctuation);
    writeBest3Level(sink, "symbol", symbol);
    writeBest3Level(sink, "graphical", graphical);
    // unlike the rest, emitted via the 4-level trie builder
    writeBest4Level(sink, "nonCharacter", nonCharacter);

}
908 
/// Emits the normalization quick-check tries. The No ("…_QCN") set is merged
/// with Maybe ("…_QCM") for the forms that define a Maybe value (NFC/NFKC).
void writeNormalizationTries(File sink)
{
    writeBest3Level(sink, "nfcQC", normalization["NFC_QCN"] | normalization["NFC_QCM"]);
    writeBest3Level(sink, "nfdQC", normalization["NFD_QCN"]);
    writeBest3Level(sink, "nfkcQC", normalization["NFKC_QCN"] | normalization["NFKC_QCM"]);
    writeBest3Level(sink, "nfkdQC", normalization["NFKD_QCN"]);
}
920 
/++
    Emits the tries used by the grapheme cluster breaking algorithm: hangul
    syllable types, the Grapheme_Cluster_Break classes we need, and the
    Extended_Pictographic emoji property.
+/
void writeGraphemeTries(File sink)
{
    //few specifics for grapheme cluster breaking algorithm
    //
    writeBest3Level(sink, "hangulLV", hangul.table["LV"]);
    writeBest3Level(sink, "hangulLVT", hangul.table["LVT"]);

    // Grapheme specific information
    writeBest3Level(sink, "prepend", graphemeBreaks.table["Prepend"]);
    writeBest3Level(sink, "control", graphemeBreaks.table["Control"]);

    // We use Grapheme_Cluster_Break=SpacingMark instead of GC=Mc,
    //  Grapheme_Cluster_Break=SpacingMark is derived from GC=Mc and includes other general category values
    writeBest3Level(sink, "spacingMark", graphemeBreaks.table["SpacingMark"]);

    // We use the Grapheme_Cluster_Break=Extend instead of Grapheme_Extend,
    //  Grapheme_Cluster_Break=Extend is derived from Grapheme_Extend and is more complete
    writeBest3Level(sink, "graphemeExtend", graphemeBreaks.table["Extend"]);

    // emoji related data
    // fix: this trie was previously emitted twice, which produced duplicate
    // definitions in the generated module; emit it exactly once
    writeBest3Level(sink, "Extended_Pictographic", emojiData.table["Extended_Pictographic"]);
}
953 
/// Emits the case conversion tables: the simple/full mapping split points,
/// the index tries, and the raw mapping arrays.
/// (The historical spelling of the name is kept — callers use it as-is.)
void writeCaseCoversion(File sink)
{
    // lengths that separate simple (1:1) mappings from the longer ones
    sink.writefln("enum MAX_SIMPLE_LOWER = %d;", toLowerTabSimpleLen);
    sink.writefln("enum MAX_SIMPLE_UPPER = %d;", toUpperTabSimpleLen);
    sink.writefln("enum MAX_SIMPLE_TITLE = %d;", toTitleTabSimpleLen);

    // these are case mappings that also utilize the unconditional SpecialCasing.txt rules
    writeBest3Level(sink, "toUpperIndex", toUpperIndex, ushort.max);
    writeBest3Level(sink, "toLowerIndex", toLowerIndex, ushort.max);
    writeBest3Level(sink, "toTitleIndex", toTitleIndex, ushort.max);

    // these are all case mapping tables that are 1:1 acquired from UnicodeData.txt
    writeBest3Level(sink, "toUpperSimpleIndex", toUpperSimpleIndex, ushort.max);
    writeBest3Level(sink, "toLowerSimpleIndex", toLowerSimpleIndex, ushort.max);
    writeBest3Level(sink, "toTitleSimpleIndex", toTitleSimpleIndex, ushort.max);

    // the flat mapping arrays themselves, wrapped in accessor properties
    sink.writeln("@property");
    sink.writeln("{");
    sink.writeln("private alias _IUA = immutable(uint[]);");
    sink.writefln("_IUA toUpperTable() nothrow @nogc @safe pure { static _IUA t = [%( 0x%x, %)]; return t; }", toUpperTab);
    sink.writefln("_IUA toLowerTable() nothrow @nogc @safe pure { static _IUA t = [%( 0x%x, %)]; return t; }", toLowerTab);
    sink.writefln("_IUA toTitleTable() nothrow @nogc @safe pure { static _IUA t = [%( 0x%x, %)]; return t; }", toTitleTab);
    sink.writeln("}");
}
987 
/**
    Emits the canonical and compatibility decomposition tables.

    All full decompositions are flattened into one NUL-separated dstring;
    each decomposing codepoint maps to the offset of its decomposition in
    that string, with offset 0 reserved for "does not decompose". The
    mappings are round-tripped through a Trie as a self-test before the
    tables are written.
*/
void writeDecomposition(File sink)
{
    auto fullCanon = recursivelyDecompose(canonDecomp);
    auto fullCompat = recursivelyDecompose(compatDecomp);
    dstring decompCanonFlat = "\0"~array(fullCanon.values).sort.uniq.join("\0")~"\0";
    dstring decompCompatFlat = "\0"~array(fullCompat.values).sort.uniq.join("\0")~"\0";
    stderr.writeln("Canon flattened: ", decompCanonFlat.length);
    stderr.writeln("Compat flattened: ", decompCompatFlat.length);

    ushort[dchar] mappingCanon;
    ushort[dchar] mappingCompat;
    //0 serves as doesn't decompose value
    foreach (k, v; fullCanon)
    {
        // countUntil returns -1 when not found; the previous check
        // (`idx != 0` on a size_t) let -1 wrap around and pass, only to
        // blow up later as a RangeError — keep the result signed and
        // enforce it properly.
        ptrdiff_t idx = decompCanonFlat.countUntil(v~"\0");
        enforce(idx > 0, "canonical decomposition not found in flattened table");
        enforce(decompCanonFlat[idx .. idx+v.length] == v);
        mappingCanon[k] = cast(ushort) idx;
    }
    foreach (k, v; fullCompat)
    {
        ptrdiff_t idx = decompCompatFlat.countUntil(v~"\0");
        enforce(idx > 0, "compatibility decomposition not found in flattened table");
        enforce(decompCompatFlat[idx .. idx+v.length] == v);
        mappingCompat[k] = cast(ushort) idx;
    }
    // offsets are stored as ushort, so the flat strings must fit
    enforce(decompCanonFlat.length < 2^^16);
    enforce(decompCompatFlat.length < 2^^16);

    //these 2 are just self-test for Trie template code
    auto compatRange = zip(mappingCompat.values, mappingCompat.keys).array;
    auto canonRange = zip(mappingCanon.values, mappingCanon.keys).array;
    auto compatTrie = codepointTrie!(ushort, 12, 9)(compatRange, 0);
    auto canonTrie =  codepointTrie!(ushort, 12, 9)(canonRange, 0);
    import std.string;
    foreach (k, v; fullCompat)
    {
        auto idx = compatTrie[k];
        enforce(idx == mappingCompat[k], "failed on compat");
        size_t len = decompCompatFlat[idx..$].countUntil(0);
        enforce(decompCompatFlat[idx .. idx+len] == v,
            format("failed on compat: '%( 0x0%5x %)' not found", v));
    }
    foreach (k, v; fullCanon)
    {
        auto idx = canonTrie[k];
        enforce(idx == mappingCanon[k], "failed on canon");
        size_t len = decompCanonFlat[idx..$].countUntil(0);
        enforce(decompCanonFlat[idx .. idx+len] == v,
            format("failed on canon: '%( 0x%5x %)' not found", v));
    }

    writeBest3Level(sink, "compatMapping", mappingCompat, cast(ushort) 0);
    writeBest3Level(sink, "canonMapping", mappingCanon, cast(ushort) 0);
    with(sink)
    {
        writeln("@property");
        writeln("{");
        writeln("private alias _IDCA = immutable(dchar[]);");
        writefln("_IDCA decompCanonTable() @safe pure nothrow { static _IDCA t = [%( 0x%x, %)]; return t; }", decompCanonFlat);
        writefln("_IDCA decompCompatTable() @safe pure nothrow { static _IDCA t = [%( 0x%x, %)]; return t; }", decompCompatFlat);
        writeln("}");
    }
}
1052 
/// Emits standalone classification functions generated directly from
/// the corresponding codepoint sets via `toSourceCode`.
void writeFunctions(File sink)
{
    // note: named `fmtSet` instead of `format` to avoid shadowing
    // std.format.format within this scope
    auto fmtSet = general.table["Cf"];
    auto spaceSet = general.table["Zs"];
    auto controlSet = general.table["Cc"];
    auto whiteSet = general.table["White_Space"];

    //hangul L, V, T
    auto hangulL = hangul.table["L"];
    auto hangulV = hangul.table["V"];
    auto hangulT = hangul.table["T"];

    sink.writeln(fmtSet.toSourceCode("isFormatGen"));
    sink.writeln(controlSet.toSourceCode("isControlGen"));
    sink.writeln(spaceSet.toSourceCode("isSpaceGen"));
    sink.writeln(whiteSet.toSourceCode("isWhiteGen"));
    sink.writeln(hangulL.toSourceCode("isHangL"));
    sink.writeln(hangulV.toSourceCode("isHangV"));
    sink.writeln(hangulT.toSourceCode("isHangT"));
}
1075 
1076 
/**
    Emits the canonical composition table used by normalization.

    Composable pairs are derived from the canonical decompositions
    (excluding singletons, non-starter cases and the explicit composition
    exclusions), packed into a per-first-char jump trie plus a flat array
    of (second char, composed char) pairs, self-tested, then written out.
*/
void writeCompositionTable(File sink)
{
    dchar[dstring] composeTab;
    //construct compositions table
    foreach (dchar k, dstring v; canonDecomp)
    {
        if (v.length != 2)//singleton
            continue;
        if (v[0] in combiningMapping) //non-starter
            continue;
        if (k in combiningMapping) //combines to non-starter
            continue;
        if (compExclusions[k]) // non-derivable exclusions
            continue;
        composeTab[v] = k;
    }

    // flatten into (first, second, composed) triples sorted by (first, second)
    Tuple!(dchar, dchar, dchar)[] triples;
    foreach (dstring key, dchar val; composeTab)
        triples ~= Tuple!(dchar, dchar, dchar)(key[0], key[1], val);
    multiSort!("a[0] < b[0]", "a[1] < b[1]")(triples);
    //map to the triplets array
    // trimap[first] packs two fields: low 11 bits = start index of the run
    // of triples sharing `first`, high 5 bits = length of that run
    ushort[dchar] trimap;
    dchar old = triples[0][0];
    auto r = triples[];
    for (size_t idx = 0;;)
    {
        // length of the run of triples with the same first char
        ptrdiff_t cnt = countUntil!(x => x[0] != old)(r);
        if (cnt == -1)//end of input
            cnt = r.length;
        assert(idx < 2048); // start index must fit in the 11-bit field
        assert(cnt < 32); // run length must fit in the 5-bit field
        trimap[old] = to!ushort(idx | (cnt << 11));
        idx += cnt;
        if (idx == triples.length)
            break;
        old = r[cnt][0];
        r = r[cnt..$];
    }

    // self-test: every composable pair must be reachable through the trie
    auto triT = codepointTrie!(ushort, 12, 9)(trimap.toPairs, ushort.max);
    auto dupletes = triples.map!(x => tuple(x[1], x[2])).array;
    foreach (dstring key, dchar val; composeTab)
    {
        size_t pack = triT[key[0]];
        assert(pack != ushort.max);
        // unpack the (index, count) fields and scan the run for the pair
        size_t idx = pack & ((1 << 11) - 1), cnt = pack >> 11;
        auto f = dupletes[idx .. idx+cnt].find!(x => x[0] == key[1]);
        assert(!f.empty);
        // & starts with the right value
        assert(f.front[1] == val);
    }
    with(sink)
    {
        writeln("enum composeIdxMask = (1 << 11) - 1, composeCntShift = 11;");
        write("enum compositionJumpTrieEntries = TrieEntry!(ushort, 12, 9)(");
        triT.store(sink.lockingTextWriter());
        writeln(");");
        writeln("@property immutable(CompEntry[]) compositionTable() nothrow pure @nogc @safe");
        writeln("{");
        writeln("alias CE = CompEntry;");
        write("static immutable CE[] t = [");
        foreach (pair; dupletes)
            writef("CE(0x%05x, 0x%05x),", pair[0], pair[1]);
        writeln("];");
        writeln("return t;");
        writeln("}");
    }
}
1146 
/// Sanity-checks the combining-class mapping against the per-class sets,
/// then emits the best combining class trie.
void writeCombining(File sink)
{
    auto trie = codepointTrie!(ubyte, 7, 5, 9)(combiningMapping.toPairs);
    // class 0 is the default for the vast majority of codepoints,
    // so only classes 1 .. 254 are represented explicitly
    foreach (cls; 1 .. 255)
    {
        foreach (ch; combiningClass[cls].byCodepoint)
            assert(trie[ch] == cls);
    }
    writeBest3Level(sink, "combiningClass", combiningMapping);
}
1157 
//fussy compare for unicode property names as per UTS-18:
//whitespace, '-' and '_' are ignored and letters compare ASCII case-insensitively
int comparePropertyName(Char)(const(Char)[] a, const(Char)[] b)
{
    // characters that UTS-18 loose matching skips entirely;
    // names are all ASCII but stray whitespace might be unicode
    static bool ignorable(dchar c)
    {
        return isWhite(c) || c == '-' || c == '_';
    }
    for (;;)
    {
        while (!a.empty && ignorable(a.front))
            a.popFront();
        while (!b.empty && ignorable(b.front))
            b.popFront();
        if (a.empty)
            return b.empty ? 0 : -1;
        if (b.empty)
            return 1;
        immutable ca = std.ascii.toLower(a.front);
        immutable cb = std.ascii.toLower(b.front);
        if (ca != cb)
            return ca < cb ? -1 : 1;
        a.popFront();
        b.popFront();
    }
}
1185 
/// `true` when `a` sorts strictly before `b` under the fuzzy
/// UTS-18 property name comparison.
bool propertyNameLess(Char)(const(Char)[] a, const(Char)[] b)
{
    immutable order = comparePropertyName(a, b);
    return order < 0;
}
1190 
1191 //meta helpers to generate and pick the best trie by size & levels
1192 
/// Tries every 2-level split of the 21 codepoint bits and emits the
/// smallest resulting trie for `set` under `name`.
void writeBest2Level(Set)(File sink, string name, Set set)
    if (isCodepointSet!Set)
{
    alias Splits = TypeTuple!(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    size_t bestSize = size_t.max;
    void delegate(File) emit;
    foreach (bits1; Splits)
    {
        enum bits2 = 21 - bits1;
        auto candidate = codepointSetTrie!(bits1, bits2)(set);
        if (candidate.bytes < bestSize)
        {
            bestSize = candidate.bytes;
            emit = createPrinter!(bits1, bits2)(name, candidate);
        }
    }
    emit(sink);
}
1211 
/// Overload for associative arrays mapping codepoints to values;
/// `defValue` is used for codepoints absent from `map`.
void writeBest2Level(V, K)(File sink, string name, V[K] map, V defValue=V.init)
{
    alias Splits = TypeTuple!(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    auto pairs = zip(map.values, map.keys).array;
    size_t bestSize = size_t.max;
    void delegate(File) emit;
    foreach (bits1; Splits)
    {
        enum bits2 = 21 - bits1;
        alias CandidateTrie = codepointTrie!(V, bits1, bits2);
        auto candidate = CandidateTrie(pairs, defValue);
        if (candidate.bytes < bestSize)
        {
            bestSize = candidate.bytes;
            emit = createPrinter!(bits1, bits2)(name, candidate);
        }
    }
    emit(sink);
}
1231 
1232 alias List_1 = TypeTuple!(4, 5, 6, 7, 8);
1233 
/// Searches all 3-level splits of the 21 codepoint bits and emits the
/// smallest trie for `set` under `name`.
///
/// Access speed trumps size, and powers of 2 are faster to access:
/// e.g. 9-5-7 is far slower than 8-5-8, since with 8-5-8 the stage
/// indexes are 21-8 = 13 bits and 13-5 = 8 bits — each fits a byte —
/// while 9-5-7 yields 14- and 9-bit indexes that don't. The search
/// below is constrained so stage 1 fits a byte (List_1) and stage 2
/// fits a ushort, which keeps hand-picked splits like 8-5-8 in play.
void writeBest3Level(Set)(File sink, string name, Set set)
    if (isCodepointSet!Set)
{
    alias Splits = TypeTuple!(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    size_t bestSize = size_t.max;
    void delegate(File) emit;
    foreach (bits1; List_1) // first stage index fits in a byte
    {
        foreach (bits2; Splits)
        {
            static if (bits1 + bits2 <= 16) // second stage index fits in a ushort
            {
                enum bits3 = 21 - bits1 - bits2;
                auto candidate = codepointSetTrie!(bits1, bits2, bits3)(set);
                if (candidate.bytes < bestSize)
                {
                    bestSize = candidate.bytes;
                    emit = createPrinter!(bits1, bits2, bits3)(name, candidate);
                }
            }
        }
    }
    emit(sink);
}
1266 
/// 3-level overload for associative arrays mapping codepoints to values;
/// `defValue` fills codepoints absent from `map`.
void writeBest3Level(V, K)(File sink, string name, V[K] map, V defValue=V.init)
{
    alias Splits = TypeTuple!(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    auto pairs = zip(map.values, map.keys).array;
    size_t bestSize = size_t.max;
    void delegate(File) emit;
    foreach (bits1; List_1) // first stage index fits in a byte
    {
        foreach (bits2; Splits)
        {
            static if (bits1 + bits2 <= 16) // second stage index fits in a ushort
            {
                enum bits3 = 21 - bits1 - bits2;
                auto candidate = codepointTrie!(V, bits1, bits2, bits3)(pairs, defValue);
                if (candidate.bytes < bestSize)
                {
                    bestSize = candidate.bytes;
                    emit = createPrinter!(bits1, bits2, bits3)(name, candidate);
                }
            }
        }
    }
    emit(sink);
}
1291 
/// Like `writeBest3Level` but searching all 4-level splits; useful for
/// very sparse sets (e.g. noncharacters) where an extra level packs tighter.
void writeBest4Level(Set)(File sink, string name, Set set)
    if (isCodepointSet!Set) // constraint added for consistency with the 2-/3-level overloads
{
    alias List = TypeTuple!(4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
    size_t min = size_t.max;
    void delegate(File) write;
    foreach (lvl_1; List_1)//to have the first stage index fit in byte
    {
        foreach (lvl_2; List)
        {
            foreach (lvl_3; List)
            {
                // first three levels capped so the later stage indexes stay small
                static if (lvl_1 + lvl_2 + lvl_3  <= 16)
                {
                    enum lvl_4 = 21-lvl_3-lvl_2-lvl_1;
                    auto t = codepointSetTrie!(lvl_1, lvl_2, lvl_3, lvl_4)(set);
                    if (t.bytes < min)
                    {
                        min = t.bytes;
                        write = createPrinter!(lvl_1, lvl_2, lvl_3, lvl_4)(name, t);
                    }
                }
            }
        }
    }
    write(sink);
}
1318 
/**
    Builds a delegate that writes `trie` to a file as a
    `enum <name>TrieEntries = TrieEntry!(<value type>, <levels...>)(...)`
    declaration, prefixed with a byte-size comment.

    `Params` carries the per-level bit widths; `T` is the trie type
    (it must provide `bytes`, indexing, and `store`).
*/
template createPrinter(Params...)
{
    void delegate(File) createPrinter(T)(string name, T trie)
    {
        return (File sink){
            // value type is recovered from the trie's element type,
            // stripped of qualifiers for the generated source
            sink.writef("//%d bytes\nenum %sTrieEntries = TrieEntry!(%s",
                trie.bytes, name, Unqual!(typeof(T.init[0])).stringof);
            // append the level bit widths as extra template arguments
            foreach (lvl; Params[0..$])
                sink.writef(", %d", lvl);
            sink.write(")(");
            // serialize the trie's packed data as the constructor argument
            trie.store(sink.lockingTextWriter());
            sink.writeln(");");
        };
    }
}