//Written in the D programming language
/**
gen_uni is a tool to automatically generate source code for unicode data structures.

To generate the tables use:
```
$ rdmd -m32 unicode_table_generator.d
$ rdmd -m64 unicode_table_generator.d --min
```

See the global ``UnicodeDatabaseDirectory`` for the latest version of the Unicode database that was used to generate the tables.

TODO: Support emitting of Turkic casefolding mappings

Authors: Dmitry Olshansky

License: Boost
*/
module std.unicode_table_generator;
// this shouldn't be in std package, but stuff is package'd as that in std.uni.

import std.uni, std.stdio, std.traits, std.typetuple,
    std.exception, std.format, std.algorithm, std.typecons,
    std.regex, std.range, std.conv, std.getopt;

import std.file : exists;
static import std.ascii, std.string;

//common binary property sets and their aliases
struct PropertyTable
{
    CodepointSet[string] table;
    string[string] aliases;
}

// Module-level state filled in by the load* functions and consumed by the write* functions.
PropertyTable general;
PropertyTable blocks;
PropertyTable scripts;
PropertyTable hangul;
PropertyTable graphemeBreaks;
PropertyTable emojiData;

//quick NO/MAYBE character sets
CodepointSet[string] normalization;

//auxiliary sets for case mapping
CodepointSet lowerCaseSet, upperCaseSet;
//CodepointSet titleCaseSet; //no sensible version found for isTitlecase

// sets for toLower/toUpper/toTitle
uint[] toLowerTab;
ushort toLowerTabSimpleLen; //start of long mappings
ushort[dchar] toLowerSimpleIndex, toLowerIndex;
uint[] toUpperTab;
ushort toUpperTabSimpleLen; //ditto for Upper
ushort[dchar] toUpperSimpleIndex, toUpperIndex;
uint[] toTitleTab;
ushort toTitleTabSimpleLen; //ditto for Title
ushort[dchar] toTitleSimpleIndex, toTitleIndex;

// mixedCCEntry declares SimpleCaseEntry/FullCaseEntry/etc. both here and in the generated file.
mixin(mixedCCEntry);

//case folding mapping
SimpleCaseEntry[] simpleTable;
FullCaseEntry[] fullTable;

///canonical combining class
CodepointSet[256] combiningClass; 69 //same but packaged per dchar 70 ubyte[dchar] combiningMapping; 71 72 //unrolled decompositions 73 dstring[dchar] canonDecomp; 74 dstring[dchar] compatDecomp; 75 76 //canonical composition tables 77 dchar[] canonicalyComposableLeft; 78 dchar[] canonicalyComposableRight; 79 80 81 //canonical composition exclusions 82 CodepointSet compExclusions; 83 84 //property names to discard 85 string[] blacklist = []; 86 87 enum mixedCCEntry = ` 88 struct SimpleCaseEntry 89 { 90 uint ch; 91 ubyte n, bucket;// n - number in bucket 92 93 pure nothrow @nogc: 94 95 @property ubyte size() const 96 { 97 return bucket & 0x3F; 98 } 99 @property auto isLower() const 100 { 101 return bucket & 0x40; 102 } 103 @property auto isUpper() const 104 { 105 return bucket & 0x80; 106 } 107 } 108 109 struct FullCaseEntry 110 { 111 dchar[3] seq; 112 ubyte n, size;// n number in batch, size - size of batch 113 ubyte entry_len; 114 115 @property auto value() const @trusted pure nothrow @nogc return 116 { 117 return seq[0 .. entry_len]; 118 } 119 } 120 121 struct CompEntry 122 { 123 dchar rhs, composed; 124 } 125 126 struct UnicodeProperty 127 { 128 string name; 129 ubyte[] compressed; 130 } 131 132 struct TrieEntry(T...) 133 { 134 size_t[] offsets; 135 size_t[] sizes; 136 size_t[] data; 137 } 138 139 `; 140 141 auto fullCaseEntry(dstring value, ubyte num, ubyte batch_size) 142 { 143 dchar[3] val; 144 val[0 .. 
value.length] = value[]; 145 return FullCaseEntry(val, num, batch_size, cast(ubyte) value.length); 146 } 147 148 enum { 149 UnicodeDatabaseDirectory = "ucd-15/", 150 caseFoldingSrc = UnicodeDatabaseDirectory ~ "CaseFolding.txt", 151 blocksSrc = UnicodeDatabaseDirectory ~ "Blocks.txt", 152 propListSrc = UnicodeDatabaseDirectory ~ "PropList.txt", 153 graphemeSrc = UnicodeDatabaseDirectory ~ "auxiliary/GraphemeBreakProperty.txt", 154 emojiDataSrc = UnicodeDatabaseDirectory ~ "emoji/emoji-data.txt", 155 propertyValueAliases = UnicodeDatabaseDirectory ~ "PropertyValueAliases.txt", 156 corePropSrc = UnicodeDatabaseDirectory ~ "DerivedCoreProperties.txt", 157 normalizationPropSrc = UnicodeDatabaseDirectory ~ "DerivedNormalizationProps.txt", 158 scriptsSrc = UnicodeDatabaseDirectory ~ "Scripts.txt", 159 hangulSyllableSrc = UnicodeDatabaseDirectory ~ "HangulSyllableType.txt", 160 unicodeDataSrc = UnicodeDatabaseDirectory ~ "UnicodeData.txt", 161 compositionExclusionsSrc = UnicodeDatabaseDirectory ~ "CompositionExclusions.txt", 162 specialCasingSrc = UnicodeDatabaseDirectory ~ "SpecialCasing.txt" 163 } 164 165 enum HeaderComment = `//Written in the D programming language 166 /** 167 * License: $(HTTP boost.org/LICENSE_1_0.txt, Boost License 1.0). 168 * 169 * Authors: Dmitry Olshansky 170 * 171 */ 172 // !!! DO NOT EDIT !!! 173 // !!! Did you even read the comment? !!! 
174 // This module is automatically generated from Unicode Character Database files 175 // https://github.com/dlang/phobos/blob/master/tools/unicode_table_generator.d 176 //dfmt off`; 177 178 auto toPairs(K, V)(V[K] aa) 179 { 180 return aa.values.zip(aa.keys).array; 181 } 182 183 void main(string[] argv) 184 { 185 string mode = "w"; 186 187 bool minimal = false; 188 getopt(argv, "min", &minimal); 189 if (minimal) 190 mode = "a"; 191 192 193 if (!exists(UnicodeDatabaseDirectory)) 194 { 195 writeln("Did you forget to download the Unicode database tables?"); 196 writeln("Looking for them in: ", UnicodeDatabaseDirectory); 197 return; 198 } 199 200 enum UnicodeTableDirectory = "../std/internal/"; 201 202 auto baseSink = File(UnicodeTableDirectory ~ "unicode_tables.d", mode); 203 auto compSink = File(UnicodeTableDirectory ~ "unicode_comp.d", mode); 204 auto decompSink = File(UnicodeTableDirectory ~ "unicode_decomp.d", mode); 205 auto normSink = File(UnicodeTableDirectory ~ "unicode_norm.d", mode); 206 auto graphSink = File(UnicodeTableDirectory ~ "unicode_grapheme.d", mode); 207 if (!minimal) 208 { 209 baseSink.writeln(HeaderComment); 210 baseSink.writeln("module std.internal.unicode_tables;"); 211 baseSink.writeln("\n@safe pure nothrow @nogc package(std):\n"); 212 213 compSink.writeln("module std.internal.unicode_comp;"); 214 compSink.writeln("import std.internal.unicode_tables;"); 215 compSink.writeln("\n@safe pure nothrow @nogc package(std):\n"); 216 217 decompSink.writeln(HeaderComment); 218 decompSink.writeln("module std.internal.unicode_decomp;"); 219 decompSink.writeln("import std.internal.unicode_tables;"); 220 decompSink.writeln("\n@safe pure nothrow @nogc package(std):\n"); 221 222 normSink.writeln(HeaderComment); 223 normSink.writeln("module std.internal.unicode_norm;"); 224 normSink.writeln("import std.internal.unicode_tables;"); 225 normSink.writeln("\npackage(std):\n"); 226 227 graphSink.writeln(HeaderComment); 228 graphSink.writeln("module 
std.internal.unicode_grapheme;"); 229 graphSink.writeln("import std.internal.unicode_tables;"); 230 graphSink.writeln("\npackage(std):\n"); 231 } 232 233 loadBlocks(blocksSrc, blocks); 234 loadProperties(propListSrc, general); 235 loadProperties(corePropSrc, general); 236 loadProperties(scriptsSrc, scripts); 237 loadProperties(hangulSyllableSrc, hangul); 238 loadProperties(graphemeSrc, graphemeBreaks); 239 loadProperties(emojiDataSrc, emojiData); 240 loadPropertyAliases(propertyValueAliases); 241 242 loadUnicodeData(unicodeDataSrc); 243 loadSpecialCasing(specialCasingSrc); 244 loadExclusions(compositionExclusionsSrc); 245 loadCaseFolding(caseFoldingSrc); 246 loadNormalization(normalizationPropSrc); 247 248 static void writeTableOfSets(File sink, string prefix, PropertyTable tab) 249 { 250 sink.writeln(); 251 writeAliasTable(sink, prefix, tab); 252 } 253 254 if (!minimal) 255 { 256 writeCaseFolding(baseSink); 257 writeTableOfSets(baseSink, "uniProps", general); 258 writeTableOfSets(baseSink, "blocks", blocks); 259 writeTableOfSets(baseSink, "scripts", scripts); 260 writeTableOfSets(baseSink, "hangul", hangul); 261 writeFunctions(baseSink); 262 } 263 264 static void trieProlog(File file) 265 { 266 file.writefln("\nstatic if (size_t.sizeof == %d)\n{", size_t.sizeof); 267 } 268 269 static void trieEpilog(File file) 270 { 271 file.writeln(" 272 }\n"); 273 } 274 275 trieProlog(baseSink); 276 trieProlog(compSink); 277 trieProlog(decompSink); 278 trieProlog(graphSink); 279 trieProlog(normSink); 280 281 writeTries(baseSink); 282 writeNormalizationTries(normSink); 283 writeGraphemeTries(graphSink); 284 writeCaseCoversion(baseSink); 285 writeCombining(compSink); 286 writeDecomposition(decompSink); 287 writeCompositionTable(compSink); 288 289 trieEpilog(decompSink); 290 trieEpilog(compSink); 291 trieEpilog(baseSink); 292 trieEpilog(graphSink); 293 trieEpilog(normSink); 294 } 295 296 void scanUniData(alias Fn)(string name, Regex!char r) 297 { 298 File f = File(name); 299 
foreach (line; f.byLine) 300 { 301 auto m = match(line, r); 302 if (!m.empty) 303 Fn(m); 304 } 305 } 306 307 void loadCaseFolding(string f) 308 { 309 dchar[dchar] simple; 310 dstring[dchar] full; 311 312 auto r = regex("([^;]*); ([CFST]);\\s*([^;]*);"); 313 scanUniData!((m){ 314 auto s1 = m.captures[1]; 315 auto code = m.captures[2].front; 316 auto s2 = m.captures[3]; 317 auto left = parse!int(s1, 16); 318 if (code == 'C') 319 { 320 auto right = parse!int(s2, 16); 321 simple[left] = right; 322 full[left] = [right]; 323 } 324 else if (code == 'S') 325 { 326 auto right = parse!int(s2, 16); 327 simple[left] = right; 328 } 329 else if (code == 'F') 330 { 331 dstring right; 332 foreach (x; match(s2, regex("[0-9A-Fa-f]+", "g"))) 333 { 334 right ~= to!int(x[0], 16); 335 } 336 full[left] = right.idup; 337 } 338 else if (code == 'T') 339 { 340 // TODO: ignore Turkic languages for now. 341 } 342 })(f, r); 343 344 //make some useful sets by hand 345 346 lowerCaseSet = general.table["Lowercase"]; 347 upperCaseSet = general.table["Uppercase"]; 348 //titleCaseSet = general.table["Lt"]; 349 350 foreach (ch; simple.keys()) 351 { 352 dchar[8] entry; 353 int size=0; 354 entry[size++] = ch; 355 dchar x = simple[ch]; 356 entry[size++] = x; 357 //simple is many:1 mapping 358 foreach (key, v; simple) 359 { 360 if (v == x && !canFind(entry[], key)) 361 { 362 entry[size++] = key; 363 } 364 } 365 sort(entry[0 .. size]); 366 foreach (i, value; entry[0 .. size]) 367 { 368 auto withFlags = cast(ubyte) size | (value in lowerCaseSet ? 0x40 : 0) 369 | (value in upperCaseSet ? 
0x80 : 0); 370 simpleTable ~= SimpleCaseEntry(value, cast(ubyte) i, 371 cast(ubyte) withFlags); 372 } 373 } 374 375 foreach (ch; full.keys()) 376 { 377 dstring[8] entry; 378 int size=0; 379 entry[size++] = [ch]; 380 auto x = full[ch]; 381 entry[size++] = x; 382 383 //full is many:many mapping 384 //sort single-char versions, and let them come first 385 foreach (key, v; full) 386 { 387 if (v == x && !canFind(entry[], [key])) 388 { 389 entry[size++] = [key]; 390 } 391 392 } 393 auto right = partition!(a => a.length == 1)(entry[0 .. size]); 394 sort(entry[0 .. size - right.length]); 395 foreach (i, value; entry[0 .. size]) 396 { 397 fullTable ~= fullCaseEntry(value, cast(ubyte) i, cast(ubyte) size); 398 } 399 } 400 } 401 402 void loadBlocks(string f, ref PropertyTable target) 403 { 404 auto r = regex(`^([0-9A-F]+)\.\.([0-9A-F]+);\s*(.*)\s*$`); 405 scanUniData!((m){ 406 auto s1 = m.captures[1]; 407 auto s2 = m.captures[2]; 408 auto a1 = parse!uint(s1, 16); 409 auto a2 = parse!uint(s2, 16); 410 //@@@BUG 6178 memory corruption with 411 //target[to!string(m.captures[3])] = CodepointSet(a1, a2+1); 412 auto set = CodepointSet(a1, a2+1); 413 target.table[to!string(m.captures[3])] = set; 414 })(f, r); 415 } 416 417 void loadProperties(string inp, ref PropertyTable target) 418 { 419 auto acceptProp = (string name) => countUntil(blacklist, name) < 0 && !name.startsWith("Changes"); 420 auto r = regex(`^(?:(?:([0-9A-F]+)\.\.([0-9A-F]+)|([0-9A-F]+))\s*;\s*([a-zA-Z_0-9]*)\s*#|# [a-zA-Z_0-9]+=([a-zA-Z_0-9]+))`); 421 string aliasStr; 422 auto set = CodepointSet.init; //workaround @@@BUG 6178 423 scanUniData!((m){ 424 auto name = to!string(m.captures[4]); 425 if (!acceptProp(name)) 426 return; 427 if (!m.captures[5].empty) 428 aliasStr = to!string(m.captures[5]); 429 else if (!m.captures[1].empty) 430 { 431 auto sa = m.captures[1]; 432 auto sb = m.captures[2]; 433 uint a = parse!uint(sa, 16); 434 uint b = parse!uint(sb, 16); 435 if (name !in target.table) 436 { 437 target.table[name] 
= set; 438 } 439 auto p = name in target.table; 440 p.add(a,b+1); // unicode lists [a, b] we need [a,b) 441 if (!aliasStr.empty) 442 { 443 target.aliases[name] = aliasStr; 444 aliasStr = ""; 445 } 446 } 447 else if (!m.captures[3].empty) 448 { 449 auto sx = m.captures[3]; 450 uint x = parse!uint(sx, 16); 451 if (name !in target.table) 452 { 453 target.table[name] = set; 454 } 455 auto p = name in target.table; 456 *p |= x; 457 if (!aliasStr.empty) 458 { 459 target.aliases[name] = aliasStr; 460 aliasStr = ""; 461 } 462 } 463 })(inp, r); 464 } 465 466 void loadPropertyAliases(string inp) 467 { 468 auto r = regex(`^([\w0-9_]+)\s*;\s*([\w0-9_]+)\s*;\s*([\w0-9_]+)`); 469 scanUniData!((m){ 470 auto type = m.captures[1]; 471 auto target = m.captures[2].idup; 472 auto label = m.captures[3].idup; 473 if (target != label) 474 { 475 if (type == "blk") 476 blocks.aliases[target] = label; 477 else if (type == "gc") 478 general.aliases[target] = label; 479 else if (type == "sc") 480 scripts.aliases[target] = label; 481 } 482 })(inp, r); 483 } 484 485 void loadNormalization(string inp) 486 { 487 auto r = regex(`^(?:([0-9A-F]+)\.\.([0-9A-F]+)|([0-9A-F]+))\s*;\s*(NFK?[CD]_QC)\s*;\s*([NM])|#\s*[a-zA-Z_0-9]+=([a-zA-Z_0-9]+)`); 488 CodepointSet set; //workaround @@@BUG 6178 489 scanUniData!((m){ 490 auto name = to!string(m.captures[4]) ~ to!string(m.captures[5]); 491 if (!m.captures[1].empty) 492 { 493 auto sa = m.captures[1]; 494 auto sb = m.captures[2]; 495 uint a = parse!uint(sa, 16); 496 uint b = parse!uint(sb, 16); 497 if (name !in normalization) 498 { 499 normalization[name] = set; 500 } 501 auto p = name in normalization; 502 p.add(a,b+1); 503 } 504 else if (!m.captures[3].empty) 505 { 506 auto sx = m.captures[3]; 507 uint x = parse!uint(sx, 16); 508 if (name !in normalization) 509 { 510 normalization[name] = set; 511 } 512 auto p = name in normalization; 513 *p |= x; 514 } 515 })(inp, r); 516 } 517 518 void loadUnicodeData(string inp) 519 { 520 auto f = File(inp); 521 522 
dchar characterStart; 523 bool haveRangeStart, haveRangeEnd; 524 525 CodepointSet all; 526 foreach (line; f.byLine) 527 { 528 auto fields = split(line, ";"); 529 //codepoint, name, General_Category, Canonical_Combining_Class, Bidi_Class, 530 //Decomp_Type&Mapping, upper case mapping, lower case mapping, title case mapping 531 auto codepoint = fields[0]; 532 auto decomp = fields[5]; 533 auto name = fields[1]; 534 535 dchar src = parse!uint(codepoint, 16); 536 537 if (name.endsWith("First>")) 538 { 539 assert(!haveRangeStart); 540 characterStart = src; 541 haveRangeStart = true; 542 continue; 543 } 544 else if (name.endsWith("Last>")) 545 { 546 haveRangeStart = false; 547 haveRangeEnd = true; 548 } 549 else 550 { 551 assert(!haveRangeStart); 552 haveRangeEnd = false; 553 characterStart = src; 554 } 555 556 auto generalCategory = fields[2]; 557 auto ccc = parse!uint(fields[3]); 558 auto upperCasePart = fields[12]; 559 auto lowerCasePart = fields[13]; 560 auto titleCasePart = fields[14]; 561 562 void appendCaseTab(ref ushort[dchar] index, ref uint[] chars, char[] casePart) 563 { 564 if (!casePart.empty) 565 { 566 // if you have a range, you shouldn't have any casing provided 567 assert(!haveRangeEnd); 568 569 uint ch = parse!uint(casePart, 16); 570 chars ~= ch; 571 assert(chars.length < ushort.max); 572 index[src] = cast(ushort)(chars.length-1); 573 } 574 } 575 576 if (generalCategory !in general.table) 577 general.table[generalCategory.idup] = CodepointSet.init; 578 579 all.add(characterStart, src+1); 580 combiningClass[ccc].add(characterStart, src+1); 581 general.table[generalCategory].add(characterStart, src+1); 582 583 appendCaseTab(toLowerSimpleIndex, toLowerTab, lowerCasePart); 584 appendCaseTab(toUpperSimpleIndex, toUpperTab, upperCasePart); 585 appendCaseTab(toTitleSimpleIndex, toTitleTab, titleCasePart); 586 587 if (!decomp.empty) 588 { 589 // none of the ranges in UnicodeData.txt have decompositions provided 590 assert(!haveRangeEnd); 591 592 
//stderr.writeln(codepoint, " ---> ", decomp); 593 594 dstring dest; 595 bool compat = false; 596 if (decomp.startsWith(" ")) 597 decomp = decomp[1..$]; 598 if (decomp.front == '<') 599 { 600 decomp = findSplitAfter(decomp, ">")[1]; 601 compat = true; 602 } 603 auto vals = split(decomp, " "); 604 foreach (v; vals) 605 { 606 if (!v.empty) 607 dest ~= cast(dchar) parse!uint(v, 16); 608 } 609 if (!compat) 610 { 611 assert(dest.length <= 2, "cannonical decomposition has more then 2 codepoints?!"); 612 canonDecomp[src] = dest; 613 } 614 compatDecomp[src] = dest; 615 } 616 } 617 // compute Cn as all dchar we have not found in UnicodeData.txt 618 general.table["Cn"] = all.inverted; 619 general.aliases["Cn"] = "Unassigned"; 620 auto arr = combiningClass[1 .. 255]; 621 foreach (i, clazz; arr)//0 is a default for all of 1M+ codepoints 622 { 623 auto y = clazz.byCodepoint; 624 foreach (ch; y) 625 combiningMapping[ch] = cast(ubyte)(i+1); 626 } 627 } 628 629 void loadSpecialCasing(string f) 630 { 631 { 632 toLowerTabSimpleLen = cast(ushort) toLowerTab.length; 633 toUpperTabSimpleLen = cast(ushort) toUpperTab.length; 634 toTitleTabSimpleLen = cast(ushort) toTitleTab.length; 635 636 // duplicate the simple indexes prior to adding our uncondtional rules also 637 toLowerIndex = toLowerSimpleIndex.dup; 638 toTitleIndex = toTitleSimpleIndex.dup; 639 toUpperIndex = toUpperSimpleIndex.dup; 640 } 641 642 auto file = File(f); 643 auto r = regex(`([0-9A-F]+(?:\s*[0-9A-F]+)+);`, "g"); 644 foreach (line; file.byLine) 645 { 646 if (!line.empty && line[0] == '#') 647 { 648 if (line.canFind("Conditional Mappings")) 649 { 650 // TODO: we kinda need the conditional mappings for languages like Turkish 651 break; 652 } 653 else 654 continue; 655 } 656 657 auto entries = line.match(r); 658 if (entries.empty) 659 continue; 660 auto pieces = array(entries.map!"a[1]"); 661 dchar ch = parse!uint(pieces[0], 16); 662 void processPiece(ref ushort[dchar] index, ref uint[] table, char[] piece) 663 { 664 
uint[] mapped = piece.split 665 .map!(x=>parse!uint(x, 16)).array; 666 if (mapped.length == 1) 667 { 668 table ~= mapped[0]; 669 index[ch] = cast(ushort)(table.length - 1); 670 } 671 else 672 { 673 ushort idx = cast(ushort) table.length; 674 table ~= mapped; 675 table[idx] |= (mapped.length << 24); //upper 8bits - length of sequence 676 index[ch] = idx; 677 } 678 } 679 680 // lower, title, upper 681 processPiece(toLowerIndex, toLowerTab, pieces[1]); 682 processPiece(toTitleIndex, toTitleTab, pieces[2]); 683 processPiece(toUpperIndex, toUpperTab, pieces[3]); 684 } 685 } 686 687 auto recursivelyDecompose(dstring[dchar] decompTable) 688 { 689 //apply recursively: 690 dstring[dchar] full; 691 foreach (k, v; decompTable) 692 { 693 dstring old, decomp=v; 694 do 695 { 696 old = decomp; 697 decomp = ""; 698 foreach (dchar ch; old) 699 { 700 if (ch in decompTable) 701 decomp ~= decompTable[ch]; 702 else 703 decomp ~= ch; 704 } 705 } 706 while (old != decomp); 707 full[k] = decomp; 708 } 709 return full; 710 } 711 712 void loadExclusions(string inp) 713 { 714 auto r = regex(`^([0-9A-F]+)`); 715 scanUniData!((m){ 716 auto piece = m.captures[1]; 717 uint a = parse!uint(piece, 16); 718 compExclusions |= cast(dchar) a; 719 })(inp, r); 720 } 721 722 string charsetString(CodepointSet set, string sep=";\n") 723 { 724 auto app = appender!(char[])(); 725 ubyte[] data = compressIntervals(set.byInterval); 726 assert(CodepointSet(decompressIntervals(data)) == set); 727 formattedWrite(app, "[%(0x%x, %)];", data); 728 return cast(string) app.data; 729 } 730 731 string identName(string s) 732 { 733 auto app = appender!(char[])(); 734 foreach (c; s) 735 { 736 if (c == '-' || c == ' ') 737 app.put('_'); 738 else 739 app.put(c); 740 } 741 return cast(string) app.data; 742 } 743 744 string uniformName(string s) 745 { 746 auto app = appender!(char[])(); 747 foreach (c; s) 748 { 749 if (c != '-' && c != ' ' && c != '_') 750 app.put(std.ascii.toLower(c)); 751 } 752 return cast(string) app.data; 
753 } 754 755 void writeSets(File sink, PropertyTable src) 756 { 757 with(sink) 758 { 759 writeln("private alias _T = ubyte[];"); 760 foreach (k, v; src.table) 761 { 762 writef("_T %s = ", identName(k)); 763 writeln(charsetString(v)); 764 } 765 } 766 } 767 768 void writeAliasTable(File sink, string prefix, PropertyTable src) 769 { 770 with(sink) 771 { 772 writeln("struct ", prefix); 773 writeln("{"); 774 writeln("private alias _U = immutable(UnicodeProperty);"); 775 writeln("@property static _U[] tab() pure { return _tab; }"); 776 writeln("static immutable:"); 777 writeSets(sink, src); 778 writeln("_U[] _tab = ["); 779 } 780 string[] lines; 781 string[] namesOnly; 782 auto app = appender!(char[])(); 783 auto keys = src.table.keys; 784 foreach (k; keys) 785 { 786 formattedWrite(app, "_U(\"%s\", %s),\n", k, identName(k)); 787 lines ~= app.data.idup; 788 namesOnly ~= uniformName(k); 789 app.shrinkTo(0); 790 if (k in src.aliases) 791 { 792 formattedWrite(app, "_U(\"%s\", %s),\n", src.aliases[k], identName(k)); 793 lines ~= app.data.idup; 794 namesOnly ~= uniformName(src.aliases[k]); 795 app.shrinkTo(0); 796 } 797 } 798 static bool ucmp(T)(T a, T b) { return propertyNameLess(a[0], b[0]); } 799 sort!ucmp(zip(namesOnly, lines)); 800 801 with(sink) 802 { 803 foreach (i, v; lines) 804 { 805 write(lines[i]); 806 } 807 writeln("];"); 808 writeln("}"); 809 } 810 } 811 812 void writeCaseFolding(File sink) 813 { 814 with(sink) 815 { 816 write(mixedCCEntry); 817 818 writeln("@property immutable(SimpleCaseEntry[]) simpleCaseTable()"); 819 writeln("{"); 820 writeln("alias SCE = SimpleCaseEntry;"); 821 writeln("static immutable SCE[] t = ["); 822 foreach (i, v; simpleTable) 823 { 824 writef("SCE(0x%04x, %s, 0x%0x),", v.ch, v.n, v.bucket); 825 if (i % 4 == 0) writeln(); 826 } 827 writeln("];"); 828 writeln("return t;"); 829 writeln("}"); 830 static uint maxLen = 0; 831 writeln("@property immutable(FullCaseEntry[]) fullCaseTable() nothrow @nogc @safe pure"); 832 writeln("{"); 833 
writeln("alias FCE = FullCaseEntry;"); 834 writeln("static immutable FCE[] t = ["); 835 foreach (i, v; fullTable) 836 { 837 maxLen = max(maxLen, v.entry_len); 838 if (v.entry_len > 1) 839 { 840 assert(v.n >= 1); // meaning that start of bucket is always single char 841 } 842 writef("FCE(\"%s\", %s, %s, %s),", v.value, v.n, v.size, v.entry_len); 843 if (i % 4 == 0) writeln(); 844 } 845 writeln("];"); 846 writeln("return t;"); 847 writeln("}"); 848 stderr.writefln("MAX FCF len = %d", maxLen); 849 } 850 } 851 852 void writeTries(File sink) 853 { 854 ushort[dchar] simpleIndices; 855 foreach (i, v; array(map!(x => x.ch)(simpleTable))) 856 simpleIndices[v] = cast(ushort) i; 857 858 ushort[dchar] fullIndices; 859 foreach (i, v; fullTable) 860 { 861 if (v.entry_len == 1) 862 fullIndices[v.seq[0]] = cast(ushort) i; 863 } 864 865 //these 2 only for verification of Trie code itself 866 auto st = codepointTrie!(ushort, 12, 9)( 867 zip(simpleIndices.values, simpleIndices.keys).array, ushort.max); 868 auto ft = codepointTrie!(ushort, 12, 9)( 869 zip(fullIndices.values, fullIndices.keys).array, ushort.max); 870 871 foreach (k, v; simpleIndices) 872 { 873 assert(st[k] == simpleIndices[k]); 874 } 875 876 foreach (k, v; fullIndices) 877 { 878 assert(ft[k] == fullIndices[k]); 879 } 880 881 writeBest3Level(sink, "lowerCase", lowerCaseSet); 882 writeBest3Level(sink, "upperCase", upperCaseSet); 883 //writeBest3Level("titleCase", titleCaseSet); 884 writeBest3Level(sink, "simpleCase", simpleIndices, ushort.max); 885 writeBest3Level(sink, "fullCase", fullIndices, ushort.max); 886 887 //common isXXX properties 888 auto props = general.table; 889 CodepointSet alpha = props["Alphabetic"]; //it includes some numbers, symbols & marks 890 CodepointSet mark = props["Mn"] | props["Me"] | props["Mc"]; 891 CodepointSet number = props["Nd"] | props["Nl"] | props["No"]; 892 CodepointSet punctuation = props["Pd"] | props["Ps"] | props["Pe"] 893 | props["Pc"] | props["Po"] | props["Pi"] | props["Pf"]; 
894 CodepointSet symbol = props["Sm"] | props["Sc"] | props["Sk"] | props["So"]; 895 CodepointSet graphical = alpha | mark | number | punctuation | symbol | props["Zs"]; 896 CodepointSet nonCharacter = props["Cn"]; 897 898 899 writeBest3Level(sink, "alpha", alpha); 900 writeBest3Level(sink, "mark", mark); 901 writeBest3Level(sink, "number", number); 902 writeBest3Level(sink, "punctuation", punctuation); 903 writeBest3Level(sink, "symbol", symbol); 904 writeBest3Level(sink, "graphical", graphical); 905 writeBest4Level(sink, "nonCharacter", nonCharacter); 906 907 } 908 909 void writeNormalizationTries(File sink) 910 { 911 CodepointSet nfcQC = normalization["NFC_QCN"] | normalization["NFC_QCM"]; 912 CodepointSet nfdQC = normalization["NFD_QCN"]; 913 CodepointSet nfkcQC = normalization["NFKC_QCN"] | normalization["NFKC_QCM"]; 914 CodepointSet nfkdQC = normalization["NFKD_QCN"]; 915 writeBest3Level(sink, "nfcQC", nfcQC); 916 writeBest3Level(sink, "nfdQC", nfdQC); 917 writeBest3Level(sink, "nfkcQC", nfkcQC); 918 writeBest3Level(sink, "nfkdQC", nfkdQC); 919 } 920 921 void writeGraphemeTries(File sink) 922 { 923 //few specifics for grapheme cluster breaking algorithm 924 // 925 auto props = general.table; 926 writeBest3Level(sink, "hangulLV", hangul.table["LV"]); 927 writeBest3Level(sink, "hangulLVT", hangul.table["LVT"]); 928 929 // Grapheme specific information 930 writeBest3Level(sink, "prepend", graphemeBreaks.table["Prepend"]); 931 writeBest3Level(sink, "control", graphemeBreaks.table["Control"]); 932 933 // We use Grapheme_Cluster_Break=SpacingMark instead of GC=Mc, 934 // Grapheme_Cluster_Break=SpacingMark is derived from GC=Mc and includes other general category values 935 writeBest3Level(sink, "spacingMark", graphemeBreaks.table["SpacingMark"]); 936 937 // We use the Grapheme_Cluster_Break=Extend instead of Grapheme_Extend, 938 // Grapheme_Cluster_Break=Extend is derived from Grapheme_Extend and is more complete 939 writeBest3Level(sink, "graphemeExtend", 
graphemeBreaks.table["Extend"]); 940 941 // emoji related data 942 writeBest3Level(sink, "Extended_Pictographic", emojiData.table["Extended_Pictographic"]); 943 944 sink.writeln(); 945 946 writeBest3Level 947 ( 948 sink, 949 "Extended_Pictographic", 950 emojiData.table["Extended_Pictographic"] 951 ); 952 } 953 954 void writeCaseCoversion(File sink) 955 { 956 { 957 sink.writefln("enum MAX_SIMPLE_LOWER = %d;", toLowerTabSimpleLen); 958 sink.writefln("enum MAX_SIMPLE_UPPER = %d;", toUpperTabSimpleLen); 959 sink.writefln("enum MAX_SIMPLE_TITLE = %d;", toTitleTabSimpleLen); 960 } 961 962 { 963 // these are case mappings that also utilize the unconditional SpecialCasing.txt rules 964 writeBest3Level(sink, "toUpperIndex", toUpperIndex, ushort.max); 965 writeBest3Level(sink, "toLowerIndex", toLowerIndex, ushort.max); 966 writeBest3Level(sink, "toTitleIndex", toTitleIndex, ushort.max); 967 } 968 969 { 970 // these are all case mapping tables that are 1:1 acquired from UnicodeData.txt 971 writeBest3Level(sink, "toUpperSimpleIndex", toUpperSimpleIndex, ushort.max); 972 writeBest3Level(sink, "toLowerSimpleIndex", toLowerSimpleIndex, ushort.max); 973 writeBest3Level(sink, "toTitleSimpleIndex", toTitleSimpleIndex, ushort.max); 974 } 975 976 with(sink) 977 { 978 writeln("@property"); 979 writeln("{"); 980 writeln("private alias _IUA = immutable(uint[]);"); 981 writefln("_IUA toUpperTable() nothrow @nogc @safe pure { static _IUA t = [%( 0x%x, %)]; return t; }", toUpperTab); 982 writefln("_IUA toLowerTable() nothrow @nogc @safe pure { static _IUA t = [%( 0x%x, %)]; return t; }", toLowerTab); 983 writefln("_IUA toTitleTable() nothrow @nogc @safe pure { static _IUA t = [%( 0x%x, %)]; return t; }", toTitleTab); 984 writeln("}"); 985 } 986 } 987 988 void writeDecomposition(File sink) 989 { 990 auto fullCanon = recursivelyDecompose(canonDecomp); 991 auto fullCompat = recursivelyDecompose(compatDecomp); 992 dstring decompCanonFlat = "\0"~array(fullCanon.values).sort.uniq.join("\0")~"\0"; 
993 dstring decompCompatFlat = "\0"~array(fullCompat.values).sort.uniq.join("\0")~"\0"; 994 stderr.writeln("Canon flattened: ", decompCanonFlat.length); 995 stderr.writeln("Compat flattened: ", decompCompatFlat.length); 996 997 ushort[dchar] mappingCanon; 998 ushort[dchar] mappingCompat; 999 //0 serves as doesn't decompose value 1000 foreach (k, v; fullCanon) 1001 { 1002 size_t idx = decompCanonFlat.countUntil(v~"\0"); 1003 enforce(idx != 0); 1004 enforce(decompCanonFlat[idx .. idx+v.length] == v); 1005 mappingCanon[k] = cast(ushort) idx; 1006 } 1007 foreach (k, v; fullCompat) 1008 { 1009 size_t idx = decompCompatFlat.countUntil(v~"\0"); 1010 enforce(idx != 0); 1011 enforce(decompCompatFlat[idx .. idx+v.length] == v); 1012 mappingCompat[k] = cast(ushort) idx; 1013 } 1014 enforce(decompCanonFlat.length < 2^^16); 1015 enforce(decompCompatFlat.length < 2^^16); 1016 1017 //these 2 are just self-test for Trie template code 1018 auto compatRange = zip(mappingCompat.values, mappingCompat.keys).array; 1019 auto canonRange = zip(mappingCanon.values, mappingCanon.keys).array; 1020 auto compatTrie = codepointTrie!(ushort, 12, 9)(compatRange, 0); 1021 auto canonTrie = codepointTrie!(ushort, 12, 9)(canonRange, 0); 1022 import std.string; 1023 foreach (k, v; fullCompat) 1024 { 1025 auto idx = compatTrie[k]; 1026 enforce(idx == mappingCompat[k], "failed on compat"); 1027 size_t len = decompCompatFlat[idx..$].countUntil(0); 1028 enforce(decompCompatFlat[idx .. idx+len] == v, 1029 format("failed on compat: '%( 0x0%5x %)' not found", v)); 1030 } 1031 foreach (k, v; fullCanon) 1032 { 1033 auto idx = canonTrie[k]; 1034 enforce(idx == mappingCanon[k], "failed on canon"); 1035 size_t len = decompCanonFlat[idx..$].countUntil(0); 1036 enforce(decompCanonFlat[idx .. 
idx+len] == v, 1037 format("failed on canon: '%( 0x%5x %)' not found", v)); 1038 } 1039 1040 writeBest3Level(sink, "compatMapping", mappingCompat, cast(ushort) 0); 1041 writeBest3Level(sink, "canonMapping", mappingCanon, cast(ushort) 0); 1042 with(sink) 1043 { 1044 writeln("@property"); 1045 writeln("{"); 1046 writeln("private alias _IDCA = immutable(dchar[]);"); 1047 writefln("_IDCA decompCanonTable() @safe pure nothrow { static _IDCA t = [%( 0x%x, %)]; return t; }", decompCanonFlat); 1048 writefln("_IDCA decompCompatTable() @safe pure nothrow { static _IDCA t = [%( 0x%x, %)]; return t; }", decompCompatFlat); 1049 writeln("}"); 1050 } 1051 } 1052 1053 void writeFunctions(File sink) 1054 { 1055 auto format = general.table["Cf"]; 1056 auto space = general.table["Zs"]; 1057 auto control = general.table["Cc"]; 1058 auto whitespace = general.table["White_Space"]; 1059 1060 //hangul L, V, T 1061 auto hangL = hangul.table["L"]; 1062 auto hangV = hangul.table["V"]; 1063 auto hangT = hangul.table["T"]; 1064 with(sink) 1065 { 1066 writeln(format.toSourceCode("isFormatGen")); 1067 writeln(control.toSourceCode("isControlGen")); 1068 writeln(space.toSourceCode("isSpaceGen")); 1069 writeln(whitespace.toSourceCode("isWhiteGen")); 1070 writeln(hangL.toSourceCode("isHangL")); 1071 writeln(hangV.toSourceCode("isHangV")); 1072 writeln(hangT.toSourceCode("isHangT")); 1073 } 1074 } 1075 1076 1077 void writeCompositionTable(File sink) 1078 { 1079 dchar[dstring] composeTab; 1080 //construct compositions table 1081 foreach (dchar k, dstring v; canonDecomp) 1082 { 1083 if (v.length != 2)//singleton 1084 continue; 1085 if (v[0] in combiningMapping) //non-starter 1086 continue; 1087 if (k in combiningMapping) //combines to non-starter 1088 continue; 1089 if (compExclusions[k]) // non-derivable exclusions 1090 continue; 1091 composeTab[v] = k; 1092 } 1093 1094 Tuple!(dchar, dchar, dchar)[] triples; 1095 foreach (dstring key, dchar val; composeTab) 1096 triples ~= Tuple!(dchar, dchar, 
            dchar)(key[0], key[1], val);
    multiSort!("a[0] < b[0]", "a[1] < b[1]")(triples);
    //map to the triplets array
    // For each first codepoint, pack the start index of its run (low 11 bits)
    // and the run length (upper bits) into one ushort; the asserts below
    // guarantee both fields fit.
    ushort[dchar] trimap;
    dchar old = triples[0][0];
    auto r = triples[];
    for (size_t idx = 0;;)
    {
        ptrdiff_t cnt = countUntil!(x => x[0] != old)(r);
        if (cnt == -1)//end of input
            cnt = r.length;
        assert(idx < 2048); // start index must fit in 11 bits
        assert(cnt < 32);   // run length must fit in the remaining bits
        trimap[old] = to!ushort(idx | (cnt << 11));
        idx += cnt;
        if (idx == triples.length)
            break;
        old = r[cnt][0];
        r = r[cnt..$];
    }

    // ushort.max marks "no composition starts with this codepoint".
    auto triT = codepointTrie!(ushort, 12, 9)(trimap.toPairs, ushort.max);
    auto dupletes = triples.map!(x => tuple(x[1], x[2])).array;
    // Self-check: every pair in composeTab must be reachable through the
    // trie + packed-run indirection and yield the right composed codepoint.
    foreach (dstring key, dchar val; composeTab)
    {
        size_t pack = triT[key[0]];
        assert(pack != ushort.max);
        size_t idx = pack & ((1 << 11) - 1), cnt = pack >> 11;
        auto f = dupletes[idx .. idx+cnt].find!(x => x[0] == key[1]);
        assert(!f.empty);
        // & starts with the right value
        assert(f.front[1] == val);
    }
    with(sink)
    {
        writeln("enum composeIdxMask = (1 << 11) - 1, composeCntShift = 11;");
        write("enum compositionJumpTrieEntries = TrieEntry!(ushort, 12, 9)(");
        triT.store(sink.lockingTextWriter());
        writeln(");");
        writeln("@property immutable(CompEntry[]) compositionTable() nothrow pure @nogc @safe");
        writeln("{");
        writeln("alias CE = CompEntry;");
        write("static immutable CE[] t = [");
        foreach (pair; dupletes)
            writef("CE(0x%05x, 0x%05x),", pair[0], pair[1]);
        writeln("];");
        writeln("return t;");
        writeln("}");
    }
}

/// Self-checks the combining-class trie against the per-class codepoint sets
/// (combiningClass/combiningMapping are module-level state filled earlier),
/// then emits the best 3-level trie for canonical combining classes.
void writeCombining(File sink)
{
    auto ct = codepointTrie!(ubyte, 7, 5, 9)(combiningMapping.toPairs);
    // index i corresponds to combining class i+1 because of the [1 .. 255] slice
    foreach (i, clazz; combiningClass[1 .. 255])//0 is a default for all of 1M+ codepoints
    {
        foreach (ch; clazz.byCodepoint)
            assert(ct[ch] == i+1);
    }
    writeBest3Level(sink, "combiningClass", combiningMapping);
}

//fussy compare for unicode property names as per UTS-18
// Compares two property names loosely: whitespace, '-' and '_' are ignored
// and the remaining (ASCII) characters are compared case-insensitively.
// Returns <0, 0 or >0 in the manner of strcmp.
int comparePropertyName(Char)(const(Char)[] a, const(Char)[] b)
{
    for (;;)
    {
        // skip insignificant characters in both names
        while (!a.empty && (isWhite(a.front) || a.front == '-' || a.front =='_'))
        {
            a.popFront();
        }
        while (!b.empty && (isWhite(b.front) || b.front == '-' || b.front =='_'))
        {
            b.popFront();
        }
        if (a.empty)
            return b.empty ? 0 : -1;
        if (b.empty)
            return 1;
        // names are all in ASCII either way though whitespace might be unicode
        auto ca = std.ascii.toLower(a.front), cb = std.ascii.toLower(b.front);
        if (ca > cb)
            return 1;
        else if ( ca < cb)
            return -1;
        a.popFront();
        b.popFront();
    }
}

/// Ordering predicate built on comparePropertyName, for sorted name tables.
bool propertyNameLess(Char)(const(Char)[] a, const(Char)[] b)
{
    return comparePropertyName(a, b) < 0;
}

//meta helpers to generate and pick the best trie by size & levels

/// Tries every 2-level split of the 21-bit codepoint space and emits the
/// trie with the smallest byte footprint for the given set.
void writeBest2Level(Set)(File sink, string name, Set set)
    if (isCodepointSet!Set)
{
    alias List = TypeTuple!(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    size_t min = size_t.max; // smallest size seen so far (shadows std min; intentional)
    void delegate(File) write; // printer for the current best candidate
    foreach (lvl_1; List)
    {
        enum lvl_2 = 21-lvl_1; // levels must cover all 21 bits of a codepoint
        auto t = codepointSetTrie!(lvl_1, lvl_2)(set);
        if (t.bytes < min)
        {
            min = t.bytes;
            write = createPrinter!(lvl_1, lvl_2)(name, t);
        }
    }
    write(sink);
}

/// Overload for associative-array data: picks the smallest 2-level
/// codepointTrie mapping K -> V, using defValue for unmapped codepoints.
void writeBest2Level(V, K)(File sink, string name, V[K] map, V defValue=V.init)
{
    alias List = TypeTuple!(5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
    size_t min = size_t.max;
    void delegate(File) write;
    auto range = zip(map.values, map.keys).array;
    foreach (lvl_1; List)
    {
        enum lvl_2 = 21-lvl_1;
        alias codepointTrie!(V, lvl_1, lvl_2) CurTrie;
        CurTrie t = CurTrie(range, defValue);
        if (t.bytes < min)
        {
            min = t.bytes;
            write = createPrinter!(lvl_1, lvl_2)(name, t);
        }
    }
    write(sink);
}

// First-stage page-size candidates shared by the 3- and 4-level searches;
// kept small so the first-stage index fits in a byte.
alias List_1 = TypeTuple!(4, 5, 6, 7, 8);

/// Searches 3-level splits of the 21-bit codepoint space and emits the
/// smallest trie for the given set.
auto writeBest3Level(Set)(File sink, string name, Set set)
    if (isCodepointSet!Set)
{
    // access speed trumps size, power of 2 is faster to access
    // e.g. 9, 5, 7 is far slower then 8, 5, 8 because of how bits breakdown:
    // 8-5-8: indexes are 21-8 = 13 bits, 13-5 = 8 bits, fits into a byte
    // 9-5-7: indexes are 21-7 = 14 bits, 14-5 = 9 bits, doesn't fit into a byte (!)

    // e.g. 8-5-8 is one of hand picked that is a very close match
    // to the best packing
    void delegate(File) write;

    alias List = TypeTuple!(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    size_t min = size_t.max;
    foreach (lvl_1; List_1)//to have the first stage index fit in byte
    {
        foreach (lvl_2; List)
        {
            static if (lvl_1 + lvl_2 <= 16)//so that 2nd stage fits in ushort
            {
                enum lvl_3 = 21-lvl_2-lvl_1; // remaining bits go to the last level
                auto t = codepointSetTrie!(lvl_1, lvl_2, lvl_3)(set);
                if (t.bytes < min)
                {
                    min = t.bytes;
                    write = createPrinter!(lvl_1, lvl_2, lvl_3)(name, t);
                }
            }
        }
    }
    write(sink);
}

/// Overload for associative-array data: picks the smallest 3-level
/// codepointTrie mapping K -> V, using defValue for unmapped codepoints.
void writeBest3Level(V, K)(File sink, string name, V[K] map, V defValue=V.init)
{
    void delegate(File) write;
    alias List = TypeTuple!(4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15);
    size_t min = size_t.max;
    auto range = zip(map.values, map.keys).array;
    foreach (lvl_1; List_1)//to have the first stage index fit in byte
    {
        foreach (lvl_2; List)
        {
            static if (lvl_1 + lvl_2 <= 16)// into ushort
            {
                enum lvl_3 = 21-lvl_2-lvl_1;
                auto t = codepointTrie!(V, lvl_1, lvl_2, lvl_3) (range, defValue);
                if (t.bytes < min)
                {
                    min = t.bytes;
                    write = createPrinter!(lvl_1, lvl_2, lvl_3)(name, t);
                }
            }
        }
    }
    write(sink);
}

/// Searches 4-level splits of the 21-bit codepoint space and emits the
/// smallest trie for the given set.
void writeBest4Level(Set)(File sink, string name, Set set)
{
    alias List = TypeTuple!(4, 5, 6, 7, 8, 9, 10, 11, 12, 13);
    size_t min = size_t.max;
    void delegate(File) write;
    foreach (lvl_1; List_1)//to have the first stage index fit in byte
    {
        foreach (lvl_2; List)
        {
            foreach (lvl_3; List)
            {
                static if (lvl_1 + lvl_2 + lvl_3 <= 16)
                {
                    enum lvl_4 = 21-lvl_3-lvl_2-lvl_1;
                    auto t = codepointSetTrie!(lvl_1, lvl_2, lvl_3, lvl_4)(set);
                    if (t.bytes < min)
                    {
                        min = t.bytes;
                        write = createPrinter!(lvl_1, lvl_2, lvl_3, lvl_4)(name, t);
                    }
                }
            }
        }
    }
    write(sink);
}

/// Returns a delegate that prints the given trie as D source: a
/// `<name>TrieEntries = TrieEntry!(T, levels...)(…)` enum literal, prefixed
/// with a byte-size comment. Params carries the per-level bit counts.
template createPrinter(Params...)
{
    void delegate(File) createPrinter(T)(string name, T trie)
    {
        return (File sink){
            sink.writef("//%d bytes\nenum %sTrieEntries = TrieEntry!(%s",
                trie.bytes, name, Unqual!(typeof(T.init[0])).stringof);
            foreach (lvl; Params[0..$])
                sink.writef(", %d", lvl);
            sink.write(")(");
            trie.store(sink.lockingTextWriter());
            sink.writeln(");");
        };
    }
}