1 /** 2 This is a tool to automatically generate source code for unicode data structures. 3 4 If not present, the script will automatically try to download the files from: https://www.unicode.org/Public 5 6 Make sure the current working directory is the /tools folder. 7 8 To update `std.internal.unicode*.d` files, run: 9 ``` 10 rdmd -m32 unicode_table_generator.d 11 rdmd -m64 unicode_table_generator.d --min 12 ``` 13 14 The -m32 run will replace the files, while the -m64 run with --min will append 64-bit specific parts. 15 The 32-bit compilation of the generator is needed because it depends on 32-bit data structures defined in `std.uni`. 16 To make -m32 work on linux, you may need to grab a 32-bit `libphobos2.a` from `dmd2/linux/lib32` and pass it as argument: 17 18 ``` 19 rdmd -m32 -Llibphobos2.a -defaultlib= unicode_table_generator.d 20 ``` 21 22 Pull Requests to untangle this complex bootstrap process are welcome! :) 23 24 TODO: Support emitting of Turkic casefolding mappings 25 26 Authors: Dmitry Olshansky 27 28 License: Boost 29 */ 30 module std.unicode_table_generator; 31 // this shouldn't be in std package, but stuff is package'd as that in std.uni. 32 33 /// Directory in which unicode files are downloaded 34 enum unicodeDir = "ucd-15-1-0/"; 35 36 /// Url from which unicode files are downloaded 37 enum unicodeBaseUrl = "https://www.unicode.org/Public/15.1.0/ucd/"; 38 39 /// Where to put generated files 40 enum outputDir = "../std/internal/"; 41 42 import std.uni, std.stdio, std.traits, std.typetuple, 43 std.exception, std.format, std.algorithm, std.typecons, 44 std.regex, std.range, std.conv, std.getopt, std.path; 45 46 import std.file : exists; 47 static import std.ascii, std.string; 48 49 //common binary property sets and their aliases 50 struct PropertyTable 51 { 52 CodepointSet[string] table; 53 string[string] aliases; 54 } 55 56 PropertyTable general; 57 PropertyTable blocks; 58 PropertyTable scripts; 59 PropertyTable hangul; 60 PropertyTable graphemeBreaks; 61 PropertyTable emojiData; 62 63 //quick NO/MAYBE charaсter sets 64 CodepointSet[string] normalization; 65 66 //axuilary sets for case mapping 67 CodepointSet lowerCaseSet, upperCaseSet; 68 //CodepointSet titleCaseSet; //no sensible version found for isTitlecase 69 70 // sets for toLower/toUpper/toTitle 71 uint[] toLowerTab; 72 ushort toLowerTabSimpleLen; //start of long mappings 73 ushort[dchar] toLowerSimpleIndex, toLowerIndex; 74 uint[] toUpperTab; 75 ushort toUpperTabSimpleLen; //ditto for Upper 76 ushort[dchar] toUpperSimpleIndex, toUpperIndex; 77 uint[] toTitleTab; 78 ushort toTitleTabSimpleLen; //ditto for Title 79 ushort[dchar] toTitleSimpleIndex, toTitleIndex; 80 81 mixin(mixedCCEntry); 82 83 //case folding mapping 84 SimpleCaseEntry[] simpleTable; 85 FullCaseEntry[] fullTable; 86 87 ///canonical combining class 88 CodepointSet[256] combiningClass; 89 //same but packaged per dchar 90 ubyte[dchar] combiningMapping; 91 92 //unrolled decompositions 93 dstring[dchar] canonDecomp; 94 dstring[dchar] compatDecomp; 95 96 //canonical composition tables 97 dchar[] canonicalyComposableLeft; 98 dchar[] canonicalyComposableRight; 99 100 101 //canonical composition exclusions 102 CodepointSet compExclusions; 103 104 //property names to discard 105 string[] blacklist = []; 106 107 struct FullCaseEntry 108 { 109 dchar[3] seq = 0; 110 ubyte n; /// number in batch 111 ubyte size; /// size - size of batch 112 ubyte entry_len; 113 114 auto value() const @safe pure nothrow @nogc return 115 { 116 return seq[0 .. entry_len]; 117 } 118 } 119 120 /// 8 byte easy SimpleCaseEntry, will be compressed to SCE which bit packs values to 4 bytes 121 struct SimpleCaseEntry 122 { 123 uint ch; 124 ubyte n; // number in bucket 125 ubyte size; 126 bool isLower; 127 bool isUpper; 128 } 129 130 enum mixedCCEntry = ` 131 /// Simple Case Entry, wrapper around uint to extract bit fields from simpleCaseTable() 132 struct SCE 133 { 134 uint x; 135 136 nothrow @nogc pure @safe: 137 138 this(uint x) 139 { 140 this.x = x; 141 } 142 143 this(uint ch, ubyte n, ubyte size) 144 { 145 this.x = ch | n << 20 | size << 24; 146 } 147 148 int ch() const { return this.x & 0x1FFFF; } 149 int n() const { return (this.x >> 20) & 0xF; } 150 int size() const { return this.x >> 24; } 151 } 152 153 /// Bit backed FullCaseEntry 154 struct FCE 155 { 156 ulong x; // bit field sizes: 18, 12, 12, 4, 4, 4 157 158 nothrow @nogc pure @safe: 159 160 this(ulong x) 161 { 162 this.x = x; 163 } 164 165 this(dchar[3] seq, ubyte n, ubyte size, ubyte entry_len) 166 { 167 this.x = ulong(seq[0]) << 36 | ulong(seq[1]) << 24 | seq[2] << 12 | n << 8 | size << 4 | entry_len << 0; 168 } 169 170 dchar[3] seq() const { return [(x >> 36) & 0x1FFFF, (x >> 24) & 0xFFF, (x >> 12) & 0xFFF]; } 171 ubyte n() const { return (x >> 8) & 0xF; } 172 ubyte size() const { return (x >> 4) & 0xF; } 173 ubyte entry_len() const { return (x >> 0) & 0xF; } 174 } 175 176 struct UnicodeProperty 177 { 178 string name; 179 ubyte[] compressed; 180 } 181 182 struct TrieEntry(T...) 183 { 184 immutable(size_t)[] offsets; 185 immutable(size_t)[] sizes; 186 immutable(size_t)[] data; 187 } 188 189 `; 190 191 auto fullCaseEntry(dstring value, ubyte num, ubyte batch_size) 192 { 193 dchar[3] val = 0; 194 val[0 .. value.length] = value[]; 195 return FullCaseEntry(val, num, batch_size, cast(ubyte) value.length); 196 } 197 198 enum { 199 caseFoldingSrc = "CaseFolding.txt", 200 blocksSrc = "Blocks.txt", 201 propListSrc = "PropList.txt", 202 graphemeSrc = "auxiliary/GraphemeBreakProperty.txt", 203 emojiDataSrc = "emoji/emoji-data.txt", 204 propertyValueAliases = "PropertyValueAliases.txt", 205 corePropSrc = "DerivedCoreProperties.txt", 206 normalizationPropSrc = "DerivedNormalizationProps.txt", 207 scriptsSrc = "Scripts.txt", 208 hangulSyllableSrc = "HangulSyllableType.txt", 209 unicodeDataSrc = "UnicodeData.txt", 210 compositionExclusionsSrc = "CompositionExclusions.txt", 211 specialCasingSrc = "SpecialCasing.txt" 212 } 213 214 void ensureFilesAreDownloaded() 215 { 216 import std.file : write, mkdirRecurse; 217 import std.net.curl : download; 218 import std.path : dirName, buildPath; 219 220 string[] files = [ 221 caseFoldingSrc, 222 blocksSrc, 223 propListSrc, 224 graphemeSrc, 225 emojiDataSrc, 226 propertyValueAliases, 227 corePropSrc, 228 normalizationPropSrc, 229 scriptsSrc, 230 hangulSyllableSrc, 231 unicodeDataSrc, 232 compositionExclusionsSrc, 233 specialCasingSrc 234 ]; 235 236 mkdirRecurse(unicodeDir); 237 foreach (string file; files) 238 { 239 string dest = unicodeDir.buildPath(file); 240 if (exists(dest)) 241 continue; 242 if (file.canFind("/")) 243 { 244 mkdirRecurse(dest.dirName); 245 } 246 writeln("downloading ", unicodeBaseUrl ~ file); 247 download(unicodeBaseUrl ~ file, dest); 248 } 249 } 250 251 enum HeaderComment = `//Written in the D programming language 252 /** 253 * License: $(HTTP boost.org/LICENSE_1_0.txt, Boost License 1.0). 254 * 255 * Authors: Dmitry Olshansky 256 * 257 */ 258 // !!! DO NOT EDIT !!! 259 // !!! Did you even read the comment? !!! 260 // This module is automatically generated from Unicode Character Database files 261 // https://github.com/dlang/phobos/blob/master/tools/unicode_table_generator.d 262 //dfmt off`; 263 264 auto toPairs(K, V)(V[K] aa) 265 { 266 return aa.values.zip(aa.keys).array; 267 } 268 269 void main(string[] argv) 270 { 271 string mode = "w"; 272 273 bool minimal = false; 274 getopt(argv, "min", &minimal); 275 if (minimal) 276 mode = "a"; 277 278 ensureFilesAreDownloaded(); 279 280 auto baseSink = File(outputDir.buildPath("unicode_tables.d"), mode); 281 auto compSink = File(outputDir.buildPath("unicode_comp.d"), mode); 282 auto decompSink = File(outputDir.buildPath("unicode_decomp.d"), mode); 283 auto normSink = File(outputDir.buildPath("unicode_norm.d"), mode); 284 auto graphSink = File(outputDir.buildPath("unicode_grapheme.d"), mode); 285 if (!minimal) 286 { 287 baseSink.writeln(HeaderComment); 288 baseSink.writeln("module std.internal.unicode_tables;"); 289 baseSink.writeln("\n@safe pure nothrow @nogc package(std):\n"); 290 291 compSink.writeln("module std.internal.unicode_comp;"); 292 compSink.writeln("import std.internal.unicode_tables;"); 293 compSink.writeln("\n@safe pure nothrow @nogc package(std):\n"); 294 295 decompSink.writeln(HeaderComment); 296 decompSink.writeln("module std.internal.unicode_decomp;"); 297 decompSink.writeln("import std.internal.unicode_tables;"); 298 decompSink.writeln("\n@safe pure nothrow @nogc package(std):\n"); 299 300 normSink.writeln(HeaderComment); 301 normSink.writeln("module std.internal.unicode_norm;"); 302 normSink.writeln("import std.internal.unicode_tables;"); 303 normSink.writeln("\npackage(std):\n"); 304 305 graphSink.writeln(HeaderComment); 306 graphSink.writeln("module std.internal.unicode_grapheme;"); 307 graphSink.writeln("import std.internal.unicode_tables;"); 308 graphSink.writeln("\npackage(std):\n"); 309 } 310 311 loadBlocks(unicodeDir.buildPath(blocksSrc), blocks); 312 loadProperties(unicodeDir.buildPath(propListSrc), general); 313 loadProperties(unicodeDir.buildPath(corePropSrc), general); 314 loadProperties(unicodeDir.buildPath(scriptsSrc), scripts); 315 loadProperties(unicodeDir.buildPath(hangulSyllableSrc), hangul); 316 loadProperties(unicodeDir.buildPath(graphemeSrc), graphemeBreaks); 317 loadProperties(unicodeDir.buildPath(emojiDataSrc), emojiData); 318 loadPropertyAliases(unicodeDir.buildPath(propertyValueAliases)); 319 320 loadUnicodeData(unicodeDir.buildPath(unicodeDataSrc)); 321 loadSpecialCasing(unicodeDir.buildPath(specialCasingSrc)); 322 loadExclusions(unicodeDir.buildPath(compositionExclusionsSrc)); 323 loadCaseFolding(unicodeDir.buildPath(caseFoldingSrc)); 324 loadNormalization(unicodeDir.buildPath(normalizationPropSrc)); 325 326 static void writeTableOfSets(File sink, string prefix, PropertyTable tab) 327 { 328 sink.writeln(); 329 writeAliasTable(sink, prefix, tab); 330 } 331 332 if (!minimal) 333 { 334 writeCaseFolding(baseSink); 335 writeTableOfSets(baseSink, "uniProps", general); 336 writeTableOfSets(baseSink, "blocks", blocks); 337 writeTableOfSets(baseSink, "scripts", scripts); 338 writeTableOfSets(baseSink, "hangul", hangul); 339 writeFunctions(baseSink); 340 } 341 342 static void trieProlog(File file) 343 { 344 file.writefln("\nstatic if (size_t.sizeof == %d)\n{", size_t.sizeof); 345 } 346 347 static void trieEpilog(File file) 348 { 349 file.writeln(" 350 }\n"); 351 } 352 353 trieProlog(baseSink); 354 trieProlog(compSink); 355 trieProlog(decompSink); 356 trieProlog(graphSink); 357 trieProlog(normSink); 358 359 writeTries(baseSink); 360 writeNormalizationTries(normSink); 361 writeGraphemeTries(graphSink); 362 writeCaseCoversion(baseSink); 363 writeCombining(compSink); 364 writeDecomposition(decompSink); 365 writeCompositionTable(compSink); 366 367 trieEpilog(decompSink); 368 trieEpilog(compSink); 369 trieEpilog(baseSink); 370 trieEpilog(graphSink); 371 trieEpilog(normSink); 372 } 373 374 void scanUniData(alias Fn)(string name, Regex!char r) 375 { 376 File f = File(name); 377 foreach (line; f.byLine) 378 { 379 auto m = match(line, r); 380 if (!m.empty) 381 Fn(m); 382 } 383 } 384 385 void loadCaseFolding(string f) 386 { 387 dchar[dchar] simple; 388 dstring[dchar] full; 389 390 auto r = regex("([^;]*); ([CFST]);\\s*([^;]*);"); 391 scanUniData!((m){ 392 auto s1 = m.captures[1]; 393 auto code = m.captures[2].front; 394 auto s2 = m.captures[3]; 395 auto left = parse!int(s1, 16); 396 if (code == 'C') 397 { 398 auto right = parse!int(s2, 16); 399 simple[left] = right; 400 full[left] = [right]; 401 } 402 else if (code == 'S') 403 { 404 auto right = parse!int(s2, 16); 405 simple[left] = right; 406 } 407 else if (code == 'F') 408 { 409 dstring right; 410 foreach (x; match(s2, regex("[0-9A-Fa-f]+", "g"))) 411 { 412 right ~= to!int(x[0], 16); 413 } 414 full[left] = right.idup; 415 } 416 else if (code == 'T') 417 { 418 // TODO: ignore Turkic languages for now. 419 } 420 })(f, r); 421 422 //make some useful sets by hand 423 424 lowerCaseSet = general.table["Lowercase"]; 425 upperCaseSet = general.table["Uppercase"]; 426 //titleCaseSet = general.table["Lt"]; 427 428 foreach (ch; simple.keys()) 429 { 430 dchar[8] entry; 431 int size=0; 432 entry[size++] = ch; 433 dchar x = simple[ch]; 434 entry[size++] = x; 435 //simple is many:1 mapping 436 foreach (key, v; simple) 437 { 438 if (v == x && !canFind(entry[], key)) 439 { 440 entry[size++] = key; 441 } 442 } 443 sort(entry[0 .. size]); 444 foreach (i, value; entry[0 .. size]) 445 { 446 simpleTable ~= SimpleCaseEntry( 447 value, 448 cast(ubyte) i, 449 cast(ubyte) size, 450 cast(bool) (value in lowerCaseSet), 451 cast(bool) (value in upperCaseSet) 452 ); 453 } 454 } 455 456 foreach (ch; full.keys()) 457 { 458 dstring[8] entry; 459 int size=0; 460 entry[size++] = [ch]; 461 auto x = full[ch]; 462 entry[size++] = x; 463 464 //full is many:many mapping 465 //sort single-char versions, and let them come first 466 foreach (key, v; full) 467 { 468 if (v == x && !canFind(entry[], [key])) 469 { 470 entry[size++] = [key]; 471 } 472 473 } 474 auto right = partition!(a => a.length == 1)(entry[0 .. size]); 475 sort(entry[0 .. size - right.length]); 476 foreach (i, value; entry[0 .. size]) 477 { 478 fullTable ~= fullCaseEntry(value, cast(ubyte) i, cast(ubyte) size); 479 } 480 } 481 } 482 483 void loadBlocks(string f, ref PropertyTable target) 484 { 485 auto r = regex(`^([0-9A-F]+)\.\.([0-9A-F]+);\s*(.*)\s*$`); 486 scanUniData!((m){ 487 auto s1 = m.captures[1]; 488 auto s2 = m.captures[2]; 489 auto a1 = parse!uint(s1, 16); 490 auto a2 = parse!uint(s2, 16); 491 //@@@BUG 6178 memory corruption with 492 //target[to!string(m.captures[3])] = CodepointSet(a1, a2+1); 493 auto set = CodepointSet(a1, a2+1); 494 target.table[to!string(m.captures[3])] = set; 495 })(f, r); 496 } 497 498 void loadProperties(string inp, ref PropertyTable target) 499 { 500 auto acceptProp = (string name) => countUntil(blacklist, name) < 0 && !name.startsWith("Changes"); 501 auto r = regex(`^(?:(?:([0-9A-F]+)\.\.([0-9A-F]+)|([0-9A-F]+))\s*;\s*([a-zA-Z_0-9]*)\s*#|# [a-zA-Z_0-9]+=([a-zA-Z_0-9]+))`); 502 string aliasStr; 503 auto set = CodepointSet.init; //workaround @@@BUG 6178 504 scanUniData!((m){ 505 auto name = to!string(m.captures[4]); 506 if (!acceptProp(name)) 507 return; 508 if (!m.captures[5].empty) 509 aliasStr = to!string(m.captures[5]); 510 else if (!m.captures[1].empty) 511 { 512 auto sa = m.captures[1]; 513 auto sb = m.captures[2]; 514 uint a = parse!uint(sa, 16); 515 uint b = parse!uint(sb, 16); 516 if (name !in target.table) 517 { 518 target.table[name] = set; 519 } 520 auto p = name in target.table; 521 p.add(a,b+1); // unicode lists [a, b] we need [a,b) 522 if (!aliasStr.empty) 523 { 524 target.aliases[name] = aliasStr; 525 aliasStr = ""; 526 } 527 } 528 else if (!m.captures[3].empty) 529 { 530 auto sx = m.captures[3]; 531 uint x = parse!uint(sx, 16); 532 if (name !in target.table) 533 { 534 target.table[name] = set; 535 } 536 auto p = name in target.table; 537 *p |= x; 538 if (!aliasStr.empty) 539 { 540 target.aliases[name] = aliasStr; 541 aliasStr = ""; 542 } 543 } 544 })(inp, r); 545 } 546 547 void loadPropertyAliases(string inp) 548 { 549 auto r = regex(`^([\w0-9_]+)\s*;\s*([\w0-9_]+)\s*;\s*([\w0-9_]+)`); 550 scanUniData!((m){ 551 auto type = m.captures[1]; 552 auto target = m.captures[2].idup; 553 auto label = m.captures[3].idup; 554 if (target != label) 555 { 556 if (type == "blk") 557 blocks.aliases[target] = label; 558 else if (type == "gc") 559 general.aliases[target] = label; 560 else if (type == "sc") 561 scripts.aliases[target] = label; 562 } 563 })(inp, r); 564 } 565 566 void loadNormalization(string inp) 567 { 568 auto r = regex(`^(?:([0-9A-F]+)\.\.([0-9A-F]+)|([0-9A-F]+))\s*;\s*(NFK?[CD]_QC)\s*;\s*([NM])|#\s*[a-zA-Z_0-9]+=([a-zA-Z_0-9]+)`); 569 CodepointSet set; //workaround @@@BUG 6178 570 scanUniData!((m){ 571 auto name = to!string(m.captures[4]) ~ to!string(m.captures[5]); 572 if (!m.captures[1].empty) 573 { 574 auto sa = m.captures[1]; 575 auto sb = m.captures[2]; 576 uint a = parse!uint(sa, 16); 577 uint b = parse!uint(sb, 16); 578 if (name !in normalization) 579 { 580 normalization[name] = set; 581 } 582 auto p = name in normalization; 583 p.add(a,b+1); 584 } 585 else if (!m.captures[3].empty) 586 { 587 auto sx = m.captures[3]; 588 uint x = parse!uint(sx, 16); 589 if (name !in normalization) 590 { 591 normalization[name] = set; 592 } 593 auto p = name in normalization; 594 *p |= x; 595 } 596 })(inp, r); 597 } 598 599 void loadUnicodeData(string inp) 600 { 601 auto f = File(inp); 602 603 dchar characterStart; 604 bool haveRangeStart, haveRangeEnd; 605 606 CodepointSet all; 607 foreach (line; f.byLine) 608 { 609 auto fields = split(line, ";"); 610 //codepoint, name, General_Category, Canonical_Combining_Class, Bidi_Class, 611 //Decomp_Type&Mapping, upper case mapping, lower case mapping, title case mapping 612 auto codepoint = fields[0]; 613 auto decomp = fields[5]; 614 auto name = fields[1]; 615 616 dchar src = parse!uint(codepoint, 16); 617 618 if (name.endsWith("First>")) 619 { 620 assert(!haveRangeStart); 621 characterStart = src; 622 haveRangeStart = true; 623 continue; 624 } 625 else if (name.endsWith("Last>")) 626 { 627 haveRangeStart = false; 628 haveRangeEnd = true; 629 } 630 else 631 { 632 assert(!haveRangeStart); 633 haveRangeEnd = false; 634 characterStart = src; 635 } 636 637 auto generalCategory = fields[2]; 638 auto ccc = parse!uint(fields[3]); 639 auto upperCasePart = fields[12]; 640 auto lowerCasePart = fields[13]; 641 auto titleCasePart = fields[14]; 642 643 void appendCaseTab(ref ushort[dchar] index, ref uint[] chars, char[] casePart) 644 { 645 if (!casePart.empty) 646 { 647 // if you have a range, you shouldn't have any casing provided 648 assert(!haveRangeEnd); 649 650 uint ch = parse!uint(casePart, 16); 651 chars ~= ch; 652 assert(chars.length < ushort.max); 653 index[src] = cast(ushort)(chars.length-1); 654 } 655 } 656 657 if (generalCategory !in general.table) 658 general.table[generalCategory.idup] = CodepointSet.init; 659 660 all.add(characterStart, src+1); 661 combiningClass[ccc].add(characterStart, src+1); 662 general.table[generalCategory].add(characterStart, src+1); 663 664 appendCaseTab(toLowerSimpleIndex, toLowerTab, lowerCasePart); 665 appendCaseTab(toUpperSimpleIndex, toUpperTab, upperCasePart); 666 appendCaseTab(toTitleSimpleIndex, toTitleTab, titleCasePart); 667 668 if (!decomp.empty) 669 { 670 // none of the ranges in UnicodeData.txt have decompositions provided 671 assert(!haveRangeEnd); 672 673 //stderr.writeln(codepoint, " ---> ", decomp); 674 675 dstring dest; 676 bool compat = false; 677 if (decomp.startsWith(" ")) 678 decomp = decomp[1..$]; 679 if (decomp.front == '<') 680 { 681 decomp = findSplitAfter(decomp, ">")[1]; 682 compat = true; 683 } 684 auto vals = split(decomp, " "); 685 foreach (v; vals) 686 { 687 if (!v.empty) 688 dest ~= cast(dchar) parse!uint(v, 16); 689 } 690 if (!compat) 691 { 692 assert(dest.length <= 2, "cannonical decomposition has more then 2 codepoints?!"); 693 canonDecomp[src] = dest; 694 } 695 compatDecomp[src] = dest; 696 } 697 } 698 // compute Cn as all dchar we have not found in UnicodeData.txt 699 general.table["Cn"] = all.inverted; 700 general.aliases["Cn"] = "Unassigned"; 701 auto arr = combiningClass[1 .. 255]; 702 foreach (i, clazz; arr)//0 is a default for all of 1M+ codepoints 703 { 704 auto y = clazz.byCodepoint; 705 foreach (ch; y) 706 combiningMapping[ch] = cast(ubyte)(i+1); 707 } 708 } 709 710 void loadSpecialCasing(string f) 711 { 712 { 713 toLowerTabSimpleLen = cast(ushort) toLowerTab.length; 714 toUpperTabSimpleLen = cast(ushort) toUpperTab.length; 715 toTitleTabSimpleLen = cast(ushort) toTitleTab.length; 716 717 // duplicate the simple indexes prior to adding our uncondtional rules also 718 toLowerIndex = toLowerSimpleIndex.dup; 719 toTitleIndex = toTitleSimpleIndex.dup; 720 toUpperIndex = toUpperSimpleIndex.dup; 721 } 722 723 auto file = File(f); 724 auto r = regex(`([0-9A-F]+(?:\s*[0-9A-F]+)+);`, "g"); 725 foreach (line; file.byLine) 726 { 727 if (!line.empty && line[0] == '#') 728 { 729 if (line.canFind("Conditional Mappings")) 730 { 731 // TODO: we kinda need the conditional mappings for languages like Turkish 732 break; 733 } 734 else 735 continue; 736 } 737 738 auto entries = line.match(r); 739 if (entries.empty) 740 continue; 741 auto pieces = array(entries.map!"a[1]"); 742 dchar ch = parse!uint(pieces[0], 16); 743 void processPiece(ref ushort[dchar] index, ref uint[] table, char[] piece) 744 { 745 uint[] mapped = piece.split 746 .map!(x=>parse!uint(x, 16)).array; 747 if (mapped.length == 1) 748 { 749 table ~= mapped[0]; 750 index[ch] = cast(ushort)(table.length - 1); 751 } 752 else 753 { 754 ushort idx = cast(ushort) table.length; 755 table ~= mapped; 756 table[idx] |= (mapped.length << 24); //upper 8bits - length of sequence 757 index[ch] = idx; 758 } 759 } 760 761 // lower, title, upper 762 processPiece(toLowerIndex, toLowerTab, pieces[1]); 763 processPiece(toTitleIndex, toTitleTab, pieces[2]); 764 processPiece(toUpperIndex, toUpperTab, pieces[3]); 765 } 766 } 767 768 auto recursivelyDecompose(dstring[dchar] decompTable) 769 { 770 //apply recursively: 771 dstring[dchar] full; 772 foreach (k, v; decompTable) 773 { 774 dstring old, decomp=v; 775 do 776 { 777 old = decomp; 778 decomp = ""; 779 foreach (dchar ch; old) 780 { 781 if (ch in decompTable) 782 decomp ~= decompTable[ch]; 783 else 784 decomp ~= ch; 785 } 786 } 787 while (old != decomp); 788 full[k] = decomp; 789 } 790 return full; 791 } 792 793 void loadExclusions(string inp) 794 { 795 auto r = regex(`^([0-9A-F]+)`); 796 scanUniData!((m){ 797 auto piece = m.captures[1]; 798 uint a = parse!uint(piece, 16); 799 compExclusions |= cast(dchar) a; 800 })(inp, r); 801 } 802 803 string charsetString(CodepointSet set, string sep=";\n") 804 { 805 auto app = appender!(char[])(); 806 ubyte[] data = compressIntervals(set.byInterval); 807 assert(CodepointSet(decompressIntervals(data)) == set); 808 formattedWrite(app, "x\"%(%02X%)\";", data); 809 return cast(string) app.data; 810 } 811 812 string identName(string s) 813 { 814 auto app = appender!(char[])(); 815 foreach (c; s) 816 { 817 if (c == '-' || c == ' ') 818 app.put('_'); 819 else 820 app.put(c); 821 } 822 return cast(string) app.data; 823 } 824 825 string uniformName(string s) 826 { 827 auto app = appender!(char[])(); 828 foreach (c; s) 829 { 830 if (c != '-' && c != ' ' && c != '_') 831 app.put(std.ascii.toLower(c)); 832 } 833 return cast(string) app.data; 834 } 835 836 void writeSets(File sink, PropertyTable src) 837 { 838 with(sink) 839 { 840 writeln("private alias _T = ubyte[];"); 841 foreach (k, v; src.table) 842 { 843 writef("_T %s = ", identName(k)); 844 writeln(charsetString(v)); 845 } 846 } 847 } 848 849 void writeAliasTable(File sink, string prefix, PropertyTable src) 850 { 851 with(sink) 852 { 853 writeln("struct ", prefix); 854 writeln("{"); 855 writeln("private alias _U = immutable(UnicodeProperty);"); 856 writeln("@property static _U[] tab() pure { return _tab; }"); 857 writeln("static immutable:"); 858 writeSets(sink, src); 859 writeln("_U[] _tab = ["); 860 } 861 string[] lines; 862 string[] namesOnly; 863 auto app = appender!(char[])(); 864 auto keys = src.table.keys; 865 foreach (k; keys) 866 { 867 formattedWrite(app, "_U(\"%s\", %s),\n", k, identName(k)); 868 lines ~= app.data.idup; 869 namesOnly ~= uniformName(k); 870 app.shrinkTo(0); 871 if (k in src.aliases) 872 { 873 formattedWrite(app, "_U(\"%s\", %s),\n", src.aliases[k], identName(k)); 874 lines ~= app.data.idup; 875 namesOnly ~= uniformName(src.aliases[k]); 876 app.shrinkTo(0); 877 } 878 } 879 static bool ucmp(T)(T a, T b) { return propertyNameLess(a[0], b[0]); } 880 sort!ucmp(zip(namesOnly, lines)); 881 882 with(sink) 883 { 884 foreach (i, v; lines) 885 { 886 write(lines[i]); 887 } 888 writeln("];"); 889 writeln("}"); 890 } 891 } 892 893 void writeCaseFolding(File sink) 894 { 895 with(sink) 896 { 897 write(mixedCCEntry); 898 899 writeln("SCE simpleCaseTable(size_t i)"); 900 writeln("{"); 901 writef("static immutable uint[] t = x\""); 902 foreach (i, v; simpleTable) 903 { 904 if (i % 12 == 0) writeln(); 905 writef("%08X", SCE(v.ch, v.n, v.size).x); 906 } 907 908 // Inspect max integer size, so efficient bit packing can be found: 909 stderr.writefln("max n: %X", simpleTable.maxElement!(x => x.n).n); // n: 2-bit 910 stderr.writefln("max ch: %X", simpleTable.maxElement!(x => x.ch).ch); // ch: 17-bit 911 stderr.writefln("max size: %X", simpleTable.maxElement!(x => x.size).size); // size: 3-bit 912 913 writeln("\";"); 914 writeln("return SCE(t[i]);"); 915 writeln("}"); 916 writeln("@property FCE fullCaseTable(size_t index) nothrow @nogc @safe pure"); 917 writeln("{"); 918 writef("static immutable ulong[] t = x\""); 919 int[4] maxS = 0; 920 foreach (i, v; fullTable) 921 { 922 foreach (j; 0 .. v.entry_len) 923 maxS[j] = max(maxS[j], v.value[j]); 924 925 if (v.entry_len > 1) 926 { 927 assert(v.n >= 1); // meaning that start of bucket is always single char 928 } 929 if (i % 6 == 0) writeln(); 930 writef("%016X", FCE(v.seq, v.n, v.size, v.entry_len).x); 931 } 932 writeln("\";"); 933 writeln("return FCE(t[index]);"); 934 writeln("}"); 935 import core.bitop : bsr; 936 stderr.writefln("max seq bits: [%d, %d, %d]", 1 + bsr(maxS[0]), 1 + bsr(maxS[1]), 1 + bsr(maxS[2])); //[17, 11, 10] 937 stderr.writefln("max n = %d", fullTable.map!(x => x.n).maxElement); // 3 938 stderr.writefln("max size = %d", fullTable.map!(x => x.size).maxElement); // 4 939 stderr.writefln("max entry_len = %d", fullTable.map!(x => x.entry_len).maxElement); // 3 940 } 941 } 942 943 void writeTries(File sink) 944 { 945 ushort[dchar] simpleIndices; 946 foreach (i, v; array(map!(x => x.ch)(simpleTable))) 947 simpleIndices[v] = cast(ushort) i; 948 949 ushort[dchar] fullIndices; 950 foreach (i, v; fullTable) 951 { 952 if (v.entry_len == 1) 953 fullIndices[v.seq[0]] = cast(ushort) i; 954 } 955 956 //these 2 only for verification of Trie code itself 957 auto st = codepointTrie!(ushort, 12, 9)( 958 zip(simpleIndices.values, simpleIndices.keys).array, ushort.max); 959 auto ft = codepointTrie!(ushort, 12, 9)( 960 zip(fullIndices.values, fullIndices.keys).array, ushort.max); 961 962 foreach (k, v; simpleIndices) 963 { 964 assert(st[k] == simpleIndices[k]); 965 } 966 967 foreach (k, v; fullIndices) 968 { 969 assert(ft[k] == fullIndices[k]); 970 } 971 972 writeBest3Level(sink, "lowerCase", lowerCaseSet); 973 writeBest3Level(sink, "upperCase", upperCaseSet); 974 //writeBest3Level("titleCase", titleCaseSet); 975 writeBest3Level(sink, "simpleCase", simpleIndices, ushort.max); 976 writeBest3Level(sink, "fullCase", fullIndices, ushort.max); 977 978 //common isXXX properties 979 auto props = general.table; 980 CodepointSet alpha = props["Alphabetic"]; //it includes some numbers, symbols & marks 981 CodepointSet mark = props["Mn"] | props["Me"] | props["Mc"]; 982 CodepointSet number = props["Nd"] | props["Nl"] | props["No"]; 983 CodepointSet punctuation = props["Pd"] | props["Ps"] | props["Pe"] 984 | props["Pc"] | props["Po"] | props["Pi"] | props["Pf"]; 985 CodepointSet symbol = props["Sm"] | props["Sc"] | props["Sk"] | props["So"]; 986 CodepointSet graphical = alpha | mark | number | punctuation | symbol | props["Zs"]; 987 CodepointSet nonCharacter = props["Cn"]; 988 989 990 writeBest3Level(sink, "alpha", alpha); 991 writeBest3Level(sink, "mark", mark); 992 writeBest3Level(sink, "number", number); 993 writeBest3Level(sink, "punctuation", punctuation); 994 writeBest3Level(sink, "symbol", symbol); 995 writeBest3Level(sink, "graphical", graphical); 996 writeBest4Level(sink, "nonCharacter", nonCharacter); 997 998 } 999 1000 void writeNormalizationTries(File sink) 1001 { 1002 CodepointSet nfcQC = normalization["NFC_QCN"] | normalization["NFC_QCM"]; 1003 CodepointSet nfdQC = normalization["NFD_QCN"]; 1004 CodepointSet nfkcQC = normalization["NFKC_QCN"] | normalization["NFKC_QCM"]; 1005 CodepointSet nfkdQC = normalization["NFKD_QCN"]; 1006 writeBest3Level(sink, "nfcQC", nfcQC); 1007 writeBest3Level(sink, "nfdQC", nfdQC); 1008 writeBest3Level(sink, "nfkcQC", nfkcQC); 1009 writeBest3Level(sink, "nfkdQC", nfkdQC); 1010 } 1011 1012 void writeGraphemeTries(File sink) 1013 { 1014 //few specifics for grapheme cluster breaking algorithm 1015 // 1016 auto props = general.table; 1017 writeBest3Level(sink, "hangulLV", hangul.table["LV"]); 1018 writeBest3Level(sink, "hangulLVT", hangul.table["LVT"]); 1019 1020 // Grapheme specific information 1021 writeBest3Level(sink, "prepend", graphemeBreaks.table["Prepend"]); 1022 writeBest3Level(sink, "control", graphemeBreaks.table["Control"]); 1023 1024 // We use Grapheme_Cluster_Break=SpacingMark instead of GC=Mc, 1025 // Grapheme_Cluster_Break=SpacingMark is derived from GC=Mc and includes other general category values 1026 writeBest3Level(sink, "spacingMark", graphemeBreaks.table["SpacingMark"]); 1027 1028 // We use the Grapheme_Cluster_Break=Extend instead of Grapheme_Extend, 1029 // Grapheme_Cluster_Break=Extend is derived from Grapheme_Extend and is more complete 1030 writeBest3Level(sink, "graphemeExtend", graphemeBreaks.table["Extend"]); 1031 1032 // emoji related data 1033 writeBest3Level(sink, "Extended_Pictographic", emojiData.table["Extended_Pictographic"]); 1034 } 1035 1036 /// Write a function that returns a dchar[] with data stored in `table` 1037 void writeDstringTable(File sink, string name, const dchar[] table) 1038 { 1039 sink.writefln("dstring %s() nothrow @nogc pure @safe {\nstatic immutable dchar[%d] t =", name, table.length); 1040 sink.writeDstring(table); 1041 sink.writeln(";\nreturn t[];\n}"); 1042 } 1043 1044 /// Write a function that returns a uint[] with data stored in `table` 1045 void writeUintTable(File sink, string name, const uint[] table) 1046 { 1047 sink.writefln("immutable(uint)[] %s() nothrow @nogc pure @safe {\nstatic immutable uint[] t =", name, ); 1048 sink.writeUintArray(table); 1049 sink.writeln(";\nreturn t;\n}"); 1050 } 1051 1052 void writeCaseCoversion(File sink) 1053 { 1054 { 1055 sink.writefln("enum MAX_SIMPLE_LOWER = %d;", toLowerTabSimpleLen); 1056 sink.writefln("enum MAX_SIMPLE_UPPER = %d;", toUpperTabSimpleLen); 1057 sink.writefln("enum MAX_SIMPLE_TITLE = %d;", toTitleTabSimpleLen); 1058 } 1059 1060 { 1061 // these are case mappings that also utilize the unconditional SpecialCasing.txt rules 1062 writeBest3Level(sink, "toUpperIndex", toUpperIndex, ushort.max); 1063 writeBest3Level(sink, "toLowerIndex", toLowerIndex, ushort.max); 1064 writeBest3Level(sink, "toTitleIndex", toTitleIndex, ushort.max); 1065 } 1066 1067 { 1068 // these are all case mapping tables that are 1:1 acquired from UnicodeData.txt 1069 writeBest3Level(sink, "toUpperSimpleIndex", toUpperSimpleIndex, ushort.max); 1070 writeBest3Level(sink, "toLowerSimpleIndex", toLowerSimpleIndex, ushort.max); 1071 writeBest3Level(sink, "toTitleSimpleIndex", toTitleSimpleIndex, ushort.max); 1072 } 1073 1074 writeUintTable(sink, "toUpperTable", toUpperTab); 1075 writeUintTable(sink, "toLowerTable", toLowerTab); 1076 writeUintTable(sink, "toTitleTable", toTitleTab); 1077 } 1078 1079 void writeDecomposition(File sink) 1080 { 1081 auto fullCanon = recursivelyDecompose(canonDecomp); 1082 auto fullCompat = recursivelyDecompose(compatDecomp); 1083 dstring decompCanonFlat = "\0"~array(fullCanon.values).sort.uniq.join("\0")~"\0"; 1084 dstring decompCompatFlat = "\0"~array(fullCompat.values).sort.uniq.join("\0")~"\0"; 1085 stderr.writeln("Canon flattened: ", decompCanonFlat.length); 1086 stderr.writeln("Compat flattened: ", decompCompatFlat.length); 1087 1088 ushort[dchar] mappingCanon; 1089 ushort[dchar] mappingCompat; 1090 //0 serves as doesn't decompose value 1091 foreach (k, v; fullCanon) 1092 { 1093 size_t idx = decompCanonFlat.countUntil(v~"\0"); 1094 enforce(idx != 0); 1095 enforce(decompCanonFlat[idx .. idx+v.length] == v); 1096 mappingCanon[k] = cast(ushort) idx; 1097 } 1098 foreach (k, v; fullCompat) 1099 { 1100 size_t idx = decompCompatFlat.countUntil(v~"\0"); 1101 enforce(idx != 0); 1102 enforce(decompCompatFlat[idx .. idx+v.length] == v); 1103 mappingCompat[k] = cast(ushort) idx; 1104 } 1105 enforce(decompCanonFlat.length < 2^^16); 1106 enforce(decompCompatFlat.length < 2^^16); 1107 1108 //these 2 are just self-test for Trie template code 1109 auto compatRange = zip(mappingCompat.values, mappingCompat.keys).array; 1110 auto canonRange = zip(mappingCanon.values, mappingCanon.keys).array; 1111 auto compatTrie = codepointTrie!(ushort, 12, 9)(compatRange, 0); 1112 auto canonTrie = codepointTrie!(ushort, 12, 9)(canonRange, 0); 1113 import std.string; 1114 foreach (k, v; fullCompat) 1115 { 1116 auto idx = compatTrie[k]; 1117 enforce(idx == mappingCompat[k], "failed on compat"); 1118 size_t len = decompCompatFlat[idx..$].countUntil(0); 1119 enforce(decompCompatFlat[idx .. idx+len] == v, 1120 format("failed on compat: '%( 0x0%5x %)' not found", v)); 1121 } 1122 foreach (k, v; fullCanon) 1123 { 1124 auto idx = canonTrie[k]; 1125 enforce(idx == mappingCanon[k], "failed on canon"); 1126 size_t len = decompCanonFlat[idx..$].countUntil(0); 1127 enforce(decompCanonFlat[idx .. idx+len] == v, 1128 format("failed on canon: '%( 0x%5x %)' not found", v)); 1129 } 1130 1131 writeBest3Level(sink, "compatMapping", mappingCompat, cast(ushort) 0); 1132 writeBest3Level(sink, "canonMapping", mappingCanon, cast(ushort) 0); 1133 1134 writeDstringTable(sink, "decompCanonTable", decompCanonFlat); 1135 writeDstringTable(sink, "decompCompatTable", decompCompatFlat); 1136 } 1137 1138 void writeFunctions(File sink) 1139 { 1140 auto format = general.table["Cf"]; 1141 auto space = general.table["Zs"]; 1142 auto control = general.table["Cc"]; 1143 auto whitespace = general.table["White_Space"]; 1144 1145 //hangul L, V, T 1146 auto hangL = hangul.table["L"]; 1147 auto hangV = hangul.table["V"]; 1148 auto hangT = hangul.table["T"]; 1149 with(sink) 1150 { 1151 writeln(format.toSourceCode("isFormatGen")); 1152 writeln(control.toSourceCode("isControlGen")); 1153 writeln(space.toSourceCode("isSpaceGen")); 1154 writeln(whitespace.toSourceCode("isWhiteGen")); 1155 writeln(hangL.toSourceCode("isHangL")); 1156 writeln(hangV.toSourceCode("isHangV")); 1157 writeln(hangT.toSourceCode("isHangT")); 1158 } 1159 } 1160 1161 /// Write a `dchar[]` as hex string 1162 void writeUintArray(T:dchar)(File sink, const T[] tab) 1163 { 1164 size_t lineCount = 1; 1165 sink.write("x\""); 1166 foreach (i, elem; tab) 1167 { 1168 if ((i % 12) == 0) 1169 sink.write("\n"); 1170 1171 sink.writef("%08X", elem); 1172 } 1173 sink.write("\""); 1174 } 1175 1176 /// Write a `dchar[]` as a dstring ""d 1177 void writeDstring(T:dchar)(File sink, const T[] tab) 1178 { 1179 size_t lineCount = 1; 1180 sink.write("\""); 1181 foreach (elem; tab) 1182 { 1183 if (lineCount >= 110) 1184 { 1185 sink.write("\"d~\n\""); 1186 lineCount = 1; 1187 } 1188 if (elem >= 0x10FFFF) 1189 { 1190 // invalid dchar, but might have extra info bit-packed in upper bits 1191 sink.writef("\"d~cast(dchar) 0x%08X~\"", elem); 1192 lineCount += 25; 1193 } 1194 else if (elem <= 0xFFFF) 1195 { 1196 sink.writef("\\u%04X", elem); 1197 lineCount += 6; 1198 } 1199 else 1200 { 1201 sink.writef("\\U%08X", elem); 1202 lineCount += 10; 1203 } 1204 } 1205 sink.write("\"d"); 1206 } 1207 1208 1209 void writeCompositionTable(File sink) 1210 { 1211 dchar[dstring] composeTab; 1212 //construct compositions table 1213 foreach (dchar k, dstring v; canonDecomp) 1214 { 1215 if (v.length != 2)//singleton 1216 continue; 1217 if (v[0] in combiningMapping) //non-starter 1218 continue; 1219 if (k in combiningMapping) //combines to non-starter 1220 continue; 1221 if (compExclusions[k]) // non-derivable exclusions 1222 continue; 1223 composeTab[v] = k; 1224 } 1225 1226 Tuple!(dchar, dchar, dchar)[] triples; 1227 foreach (dstring key, dchar val; composeTab) 1228 triples ~= Tuple!(dchar, dchar, dchar)(key[0], key[1], val); 1229 multiSort!("a[0] < b[0]", "a[1] < b[1]")(triples); 1230 //map to the triplets array 1231 ushort[dchar] trimap; 1232 dchar old = triples[0][0]; 1233 auto r = triples[]; 1234 for (size_t idx = 0;;) 1235 { 1236 ptrdiff_t cnt = countUntil!(x => x[0] != old)(r); 1237 if (cnt == -1)//end of input 1238 cnt = r.length; 1239 assert(idx < 2048); 1240 assert(cnt < 32); 1241 trimap[old] = to!ushort(idx | (cnt << 11)); 1242 idx += cnt; 1243 if (idx == triples.length) 1244 break; 1245 old = r[cnt][0]; 1246 r = r[cnt..$]; 1247 } 1248 1249 auto triT = codepointTrie!(ushort, 12, 9)(trimap.toPairs, ushort.max); 1250 auto dupletes = triples.map!(x => tuple(x[1], x[2])).array; 1251 foreach (dstring key, dchar val; composeTab) 1252 { 1253 size_t pack = triT[key[0]]; 1254 assert(pack != ushort.max); 1255 size_t idx = pack & ((1 << 11) - 1), cnt = pack >> 11; 1256 auto f = dupletes[idx .. idx+cnt].find!(x => x[0] == key[1]); 1257 assert(!f.empty); 1258 // & starts with the right value 1259 assert(f.front[1] == val); 1260 } 1261 with(sink) 1262 { 1263 writeln("enum composeIdxMask = (1 << 11) - 1, composeCntShift = 11;"); 1264 write("enum compositionJumpTrieEntries = TrieEntry!(ushort, 12, 9)("); 1265 triT.storeTrie(sink.lockingTextWriter()); 1266 writeln(");"); 1267 writeDstringTable(sink, "compositionTable", dupletes.map!(x => only(x.expand)).joiner.array); 1268 } 1269 } 1270 1271 void writeCombining(File sink) 1272 { 1273 auto ct = codepointTrie!(ubyte, 7, 5, 9)(combiningMapping.toPairs); 1274 foreach (i, clazz; combiningClass[1 .. 255])//0 is a default for all of 1M+ codepoints 1275 { 1276 foreach (ch; clazz.byCodepoint) 1277 assert(ct[ch] == i+1); 1278 } 1279 writeBest3Level(sink, "combiningClass", combiningMapping); 1280 } 1281 1282 //fussy compare for unicode property names as per UTS-18 1283 int comparePropertyName(Char)(const(Char)[] a, const(Char)[] b) 1284 { 1285 for (;;) 1286 { 1287 while (!a.empty && (isWhite(a.front) || a.front == '-' || a.front =='_')) 1288 { 1289 a.popFront(); 1290 } 1291 while (!b.empty && (isWhite(b.front) || b.front == '-' || b.front =='_')) 1292 { 1293 b.popFront(); 1294 } 1295 if (a.empty) 1296 return b.empty ? 0 : -1; 1297 if (b.empty) 1298 return 1; 1299 // names are all in ASCII either way though whitespace might be unicode 1300 auto ca = std.ascii.toLower(a.front), cb = std.ascii.toLower(b.front); 1301 if (ca > cb) 1302 return 1; 1303 else if ( ca < cb) 1304 return -1; 1305 a.popFront(); 1306 b.popFront(); 1307 } 1308 } 1309 1310 bool propertyNameLess(Char)(const(Char)[] a, const(Char)[] b) 1311 { 1312 return comparePropertyName(a, b) < 0; 1313 } 1314 1315 //meta helpers to generate and pick the best trie by size & levels 1316 1317 void writeBest2Level(Set)(File sink, string name, Set set) 1318 if (isCodepointSet!Set) 1319 { 1320 alias List = TypeTuple!(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); 1321 size_t min = size_t.max; 1322 void delegate(File) write; 1323 foreach (lvl_1; List) 1324 { 1325 enum lvl_2 = 21-lvl_1; 1326 auto t = codepointSetTrie!(lvl_1, lvl_2)(set); 1327 if (t.bytes < min) 1328 { 1329 min = t.bytes; 1330 write = createPrinter!(lvl_1, lvl_2)(name, t); 1331 } 1332 } 1333 write(sink); 1334 } 1335 1336 void writeBest2Level(V, K)(File sink, string name, V[K] map, V defValue=V.init) 1337 { 1338 alias List = TypeTuple!(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16); 1339 size_t min = size_t.max; 1340 void delegate(File) write; 1341 auto range = zip(map.values, map.keys).array; 1342 foreach (lvl_1; List) 1343 { 1344 enum lvl_2 = 21-lvl_1; 1345 alias codepointTrie!(V, lvl_1, lvl_2) CurTrie; 1346 CurTrie t = CurTrie(range, defValue); 1347 if (t.bytes < min) 1348 { 1349 min = t.bytes; 1350 write = createPrinter!(lvl_1, lvl_2)(name, t); 1351 } 1352 } 1353 write(sink); 1354 } 1355 1356 alias List_1 = TypeTuple!(4, 5, 6, 7, 8); 1357 1358 auto writeBest3Level(Set)(File sink, string name, Set set) 1359 if (isCodepointSet!Set) 1360 { 1361 // access speed trumps size, power of 2 is faster to access 1362 // e.g. 9, 5, 7 is far slower then 8, 5, 8 because of how bits breakdown: 1363 // 8-5-8: indexes are 21-8 = 13 bits, 13-5 = 8 bits, fits into a byte 1364 // 9-5-7: indexes are 21-7 = 14 bits, 14-5 = 9 bits, doesn't fit into a byte (!) 1365 1366 // e.g. 8-5-8 is one of hand picked that is a very close match 1367 // to the best packing 1368 void delegate(File) write; 1369 1370 alias List = TypeTuple!(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 1371 size_t min = size_t.max; 1372 foreach (lvl_1; List_1)//to have the first stage index fit in byte 1373 { 1374 foreach (lvl_2; List) 1375 { 1376 static if (lvl_1 + lvl_2 <= 16)//so that 2nd stage fits in ushort 1377 { 1378 enum lvl_3 = 21-lvl_2-lvl_1; 1379 auto t = codepointSetTrie!(lvl_1, lvl_2, lvl_3)(set); 1380 if (t.bytes < min) 1381 { 1382 min = t.bytes; 1383 write = createPrinter!(lvl_1, lvl_2, lvl_3)(name, t); 1384 } 1385 } 1386 } 1387 } 1388 write(sink); 1389 } 1390 1391 void writeBest3Level(V, K)(File sink, string name, V[K] map, V defValue=V.init) 1392 { 1393 void delegate(File) write; 1394 alias List = TypeTuple!(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); 1395 size_t min = size_t.max; 1396 auto range = zip(map.values, map.keys).array; 1397 foreach (lvl_1; List_1)//to have the first stage index fit in byte 1398 { 1399 foreach (lvl_2; List) 1400 { 1401 static if (lvl_1 + lvl_2 <= 16)// into ushort 1402 { 1403 enum lvl_3 = 21-lvl_2-lvl_1; 1404 auto t = codepointTrie!(V, lvl_1, lvl_2, lvl_3) (range, defValue); 1405 if (t.bytes < min) 1406 { 1407 min = t.bytes; 1408 write = createPrinter!(lvl_1, lvl_2, lvl_3)(name, t); 1409 } 1410 } 1411 } 1412 } 1413 write(sink); 1414 } 1415 1416 void writeBest4Level(Set)(File sink, string name, Set set) 1417 { 1418 alias List = TypeTuple!(4, 5, 6, 7, 8, 9, 10, 11, 12, 13); 1419 size_t min = size_t.max; 1420 void delegate(File) write; 1421 foreach (lvl_1; List_1)//to have the first stage index fit in byte 1422 { 1423 foreach (lvl_2; List) 1424 { 1425 foreach (lvl_3; List) 1426 { 1427 static if (lvl_1 + lvl_2 + lvl_3 <= 16) 1428 { 1429 enum lvl_4 = 21-lvl_3-lvl_2-lvl_1; 1430 auto t = codepointSetTrie!(lvl_1, lvl_2, lvl_3, lvl_4)(set); 1431 if (t.bytes < min) 1432 { 1433 min = t.bytes; 1434 write = createPrinter!(lvl_1, lvl_2, lvl_3, lvl_4)(name, t); 1435 } 1436 } 1437 } 1438 } 1439 } 1440 write(sink); 1441 } 1442 1443 template createPrinter(Params...) 1444 { 1445 void delegate(File) createPrinter(T)(string name, T trie) 1446 { 1447 return (File sink){ 1448 sink.writef("//%d bytes\nenum %sTrieEntries = TrieEntry!(%s", 1449 trie.bytes, name, Unqual!(typeof(T.init[0])).stringof); 1450 foreach (lvl; Params[0..$]) 1451 sink.writef(", %d", lvl); 1452 sink.write(")("); 1453 trie.storeTrie(sink.lockingTextWriter()); 1454 sink.writeln(");"); 1455 }; 1456 } 1457 } 1458 1459 void storeTrie(T, O)(T trie, O sink) 1460 { 1461 import std.format.write : formattedWrite; 1462 void store(size_t[] arr) 1463 { 1464 formattedWrite(sink, "x\""); 1465 foreach (i; 0 .. arr.length) 1466 { 1467 static if (size_t.sizeof == 8) 1468 { 1469 formattedWrite(sink, (i % 6) == 0 ? "\n" : ""); 1470 formattedWrite(sink, "%016X", arr[i]); 1471 } 1472 else 1473 { 1474 formattedWrite(sink, (i % 12) == 0 ? "\n" : ""); 1475 formattedWrite(sink, "%08X", arr[i]); 1476 } 1477 } 1478 formattedWrite(sink, "\",\n"); 1479 } 1480 // Access private members 1481 auto table = __traits(getMember, trie, "_table"); 1482 store(__traits(getMember, table, "offsets")); 1483 store(__traits(getMember, table, "sz")); 1484 store(__traits(getMember, table, "storage")); 1485 }