1 /++ 2 $(LINK2 https://en.wikipedia.org/wiki/Regular_expression, Regular expressions) 3 are a commonly used method of pattern matching 4 on strings, with $(I regex) being a catchy word for a pattern in this domain 5 specific language. Typical problems usually solved by regular expressions 6 include validation of user input and the ubiquitous find $(AMP) replace 7 in text processing utilities. 8 9 $(SCRIPT inhibitQuickIndex = 1;) 10 $(DIVC quickindex, 11 $(BOOKTABLE, 12 $(TR $(TH Category) $(TH Functions)) 13 $(TR $(TD Matching) $(TD 14 $(LREF bmatch) 15 $(LREF match) 16 $(LREF matchAll) 17 $(LREF matchFirst) 18 )) 19 $(TR $(TD Building) $(TD 20 $(LREF ctRegex) 21 $(LREF escaper) 22 $(LREF regex) 23 )) 24 $(TR $(TD Replace) $(TD 25 $(LREF replace) 26 $(LREF replaceAll) 27 $(LREF replaceAllInto) 28 $(LREF replaceFirst) 29 $(LREF replaceFirstInto) 30 )) 31 $(TR $(TD Split) $(TD 32 $(LREF split) 33 $(LREF splitter) 34 )) 35 $(TR $(TD Objects) $(TD 36 $(LREF Captures) 37 $(LREF Regex) 38 $(LREF RegexException) 39 $(LREF RegexMatch) 40 $(LREF Splitter) 41 $(LREF StaticRegex) 42 )) 43 )) 44 45 $(SECTION Synopsis) 46 47 Create a regex at runtime: 48 $(RUNNABLE_EXAMPLE 49 $(RUNNABLE_EXAMPLE_STDIN 50 They met on 24/01/1970. 51 7/8/99 wasn't as hot as 7/8/2022. 52 ) 53 --- 54 import std.regex; 55 import std.stdio; 56 // Print out all possible dd/mm/yy(yy) dates found in user input. 57 auto r = regex(r"\b[0-9][0-9]?/[0-9][0-9]?/[0-9][0-9](?:[0-9][0-9])?\b"); 58 foreach (line; stdin.byLine) 59 { 60 // matchAll() returns a range that can be iterated 61 // to get all subsequent matches. 
62 foreach (c; matchAll(line, r)) 63 writeln(c.hit); 64 } 65 --- 66 ) 67 Create a static regex at compile-time, which contains fast native code: 68 $(RUNNABLE_EXAMPLE 69 --- 70 import std.regex; 71 auto ctr = ctRegex!(`^.*/([^/]+)/?$`); 72 73 // It works just like a normal regex: 74 auto c2 = matchFirst("foo/bar", ctr); // First match found here, if any 75 assert(!c2.empty); // Be sure to check if there is a match before examining contents! 76 assert(c2[1] == "bar"); // Captures is a range of submatches: 0 = full match. 77 --- 78 ) 79 Multi-pattern regex: 80 $(RUNNABLE_EXAMPLE 81 --- 82 import std.regex; 83 auto multi = regex([`\d+,\d+`, `([a-z]+):(\d+)`]); 84 auto m = "abc:43 12,34".matchAll(multi); 85 assert(m.front.whichPattern == 2); 86 assert(m.front[1] == "abc"); 87 assert(m.front[2] == "43"); 88 m.popFront(); 89 assert(m.front.whichPattern == 1); 90 assert(m.front[0] == "12,34"); 91 --- 92 ) 93 $(LREF Captures) and `opCast!bool`: 94 $(RUNNABLE_EXAMPLE 95 --- 96 import std.regex; 97 // The result of `matchAll/matchFirst` is directly testable with `if/assert/while`, 98 // e.g. test if a string consists of letters only: 99 assert(matchFirst("LettersOnly", `^\p{L}+$`)); 100 101 // And we can take advantage of the ability to define a variable in the IfCondition: 102 if (const captures = matchFirst("At l34st one digit, but maybe more...", `((\d)(\d*))`)) 103 { 104 assert(captures[2] == "3"); 105 assert(captures[3] == "4"); 106 assert(captures[1] == "34"); 107 } 108 --- 109 ) 110 See_Also: $(LINK2 https://dlang.org/spec/statement.html#IfCondition, `IfCondition`). 111 112 $(SECTION Syntax and general information) 113 The general usage guideline is to keep regex complexity on the side of simplicity, 114 as its capabilities reside in purely character-level manipulation. 115 As such it's ill-suited for tasks involving higher level invariants 116 like matching an integer number $(U bounded) in an [a,b] interval. 
  Checks of this sort are better addressed by additional post-processing.

  The basic syntax shouldn't surprise experienced users of regular expressions.
  For an introduction to `std.regex` see a
  $(HTTP dlang.org/regular-expression.html, short tour) of the module API
  and its abilities.

  There are other web resources on regular expressions to help newcomers,
  and a good $(HTTP www.regular-expressions.info, reference with tutorial)
  can easily be found.

  This library uses a remarkably common ECMAScript syntax flavor
  with the following extensions:
  $(UL
    $(LI Named subexpressions, with Python syntax. )
    $(LI Unicode properties such as Scripts, Blocks and common binary properties e.g. Alphabetic, White_Space, Hex_Digit etc.)
    $(LI Arbitrary length and complexity lookbehind, including lookahead in lookbehind and vice versa.)
  )

  $(REG_START Pattern syntax )
  $(I std.regex operates on codepoint level,
    'character' in this table denotes a single Unicode codepoint.)
  $(REG_TABLE
    $(REG_TITLE Pattern element, Semantics )
    $(REG_TITLE Atoms, Match single characters )
    $(REG_ROW any character except [{|*+?()^$, Matches the character itself. )
    $(REG_ROW ., In single line mode matches any character.
      Otherwise it matches any character except '\n' and '\r'. )
    $(REG_ROW [class], Matches a single character
      that belongs to this character class. )
    $(REG_ROW [^class], Matches a single character that
      does $(U not) belong to this character class.)
    $(REG_ROW \cC, Matches the control character corresponding to letter C)
    $(REG_ROW \xXX, Matches a character with hexadecimal value of XX. )
    $(REG_ROW \uXXXX, Matches a character with hexadecimal value of XXXX. )
    $(REG_ROW \U00YYYYYY, Matches a character with hexadecimal value of YYYYYY. )
    $(REG_ROW \f, Matches a formfeed character. )
    $(REG_ROW \n, Matches a linefeed character.
) 155 $(REG_ROW \r, Matches a carriage return character. ) 156 $(REG_ROW \t, Matches a tab character. ) 157 $(REG_ROW \v, Matches a vertical tab character. ) 158 $(REG_ROW \d, Matches any Unicode digit. ) 159 $(REG_ROW \D, Matches any character except Unicode digits. ) 160 $(REG_ROW \w, Matches any word character (note: this includes numbers).) 161 $(REG_ROW \W, Matches any non-word character.) 162 $(REG_ROW \s, Matches whitespace, same as \p{White_Space}.) 163 $(REG_ROW \S, Matches any character except those recognized as $(I \s ). ) 164 $(REG_ROW \\\\, Matches \ character. ) 165 $(REG_ROW \c where c is one of [|*+?(), Matches the character c itself. ) 166 $(REG_ROW \p{PropertyName}, Matches a character that belongs 167 to the Unicode PropertyName set. 168 Single letter abbreviations can be used without surrounding {,}. ) 169 $(REG_ROW \P{PropertyName}, Matches a character that does not belong 170 to the Unicode PropertyName set. 171 Single letter abbreviations can be used without surrounding {,}. ) 172 $(REG_ROW \p{InBasicLatin}, Matches any character that is part of 173 the BasicLatin Unicode $(U block).) 174 $(REG_ROW \P{InBasicLatin}, Matches any character except ones in 175 the BasicLatin Unicode $(U block).) 176 $(REG_ROW \p{Cyrillic}, Matches any character that is part of 177 Cyrillic $(U script).) 178 $(REG_ROW \P{Cyrillic}, Matches any character except ones in 179 Cyrillic $(U script).) 180 $(REG_TITLE Quantifiers, Specify repetition of other elements) 181 $(REG_ROW *, Matches previous character/subexpression 0 or more times. 182 Greedy version - tries as many times as possible.) 183 $(REG_ROW *?, Matches previous character/subexpression 0 or more times. 184 Lazy version - stops as early as possible.) 185 $(REG_ROW +, Matches previous character/subexpression 1 or more times. 186 Greedy version - tries as many times as possible.) 187 $(REG_ROW +?, Matches previous character/subexpression 1 or more times. 188 Lazy version - stops as early as possible.) 
    $(REG_ROW ?, Matches previous character/subexpression 0 or 1 time.
      Greedy version - tries as many times as possible.)
    $(REG_ROW ??, Matches previous character/subexpression 0 or 1 time.
      Lazy version  - stops as early as possible.)
    $(REG_ROW {n}, Matches previous character/subexpression exactly n times. )
    $(REG_ROW {n$(COMMA)}, Matches previous character/subexpression n times or more.
      Greedy version - tries as many times as possible. )
    $(REG_ROW {n$(COMMA)}?, Matches previous character/subexpression n times or more.
      Lazy version - stops as early as possible.)
    $(REG_ROW {n$(COMMA)m}, Matches previous character/subexpression n to m times.
      Greedy version - tries as many times as possible, but no more than m times. )
    $(REG_ROW {n$(COMMA)m}?, Matches previous character/subexpression n to m times.
      Lazy version - stops as early as possible, but no fewer than n times.)
    $(REG_TITLE Other, Subexpressions $(AMP) alternations )
    $(REG_ROW (regex),  Matches subexpression regex,
      saving matched portion of text for later retrieval. )
    $(REG_ROW (?#comment), An inline comment that is ignored while matching.)
    $(REG_ROW (?:regex), Matches subexpression regex,
      $(U not) saving matched portion of text. Useful to speed up matching. )
    $(REG_ROW A|B, Matches subexpression A, or failing that, matches B. )
    $(REG_ROW (?P$(LT)name$(GT)regex), Matches named subexpression
        regex labeling it with name 'name'.
        When referring to a matched portion of text,
        names work like aliases in addition to direct numbers.
     )
    $(REG_TITLE Assertions, Match position rather than character )
    $(REG_ROW ^, Matches at the beginning of input or line (in multiline mode).)
    $(REG_ROW $, Matches at the end of input or line (in multiline mode). )
    $(REG_ROW \b, Matches at word boundary. )
    $(REG_ROW \B, Matches when $(U not) at word boundary. )
    $(REG_ROW (?=regex), Zero-width lookahead assertion.
220 Matches at a point where the subexpression 221 regex could be matched starting from the current position. 222 ) 223 $(REG_ROW (?!regex), Zero-width negative lookahead assertion. 224 Matches at a point where the subexpression 225 regex could $(U not) be matched starting from the current position. 226 ) 227 $(REG_ROW (?<=regex), Zero-width lookbehind assertion. Matches at a point 228 where the subexpression regex could be matched ending 229 at the current position (matching goes backwards). 230 ) 231 $(REG_ROW (?<!regex), Zero-width negative lookbehind assertion. 232 Matches at a point where the subexpression regex could $(U not) 233 be matched ending at the current position (matching goes backwards). 234 ) 235 ) 236 237 $(REG_START Character classes ) 238 $(REG_TABLE 239 $(REG_TITLE Pattern element, Semantics ) 240 $(REG_ROW Any atom, Has the same meaning as outside of a character class, 241 except for ] which must be written as \\]) 242 $(REG_ROW a-z, Includes characters a, b, c, ..., z. ) 243 $(REG_ROW [a||b]$(COMMA) [a--b]$(COMMA) [a~~b]$(COMMA) [a$(AMP)$(AMP)b], 244 Where a, b are arbitrary classes, means union, set difference, 245 symmetric set difference, and intersection respectively. 246 $(I Any sequence of character class elements implicitly forms a union.) ) 247 ) 248 249 $(REG_START Regex flags ) 250 $(REG_TABLE 251 $(REG_TITLE Flag, Semantics ) 252 $(REG_ROW g, Global regex, repeat over the whole input. ) 253 $(REG_ROW i, Case insensitive matching. ) 254 $(REG_ROW m, Multi-line mode, match ^, $ on start and end line separators 255 as well as start and end of input.) 256 $(REG_ROW s, Single-line mode, makes . match '\n' and '\r' as well. ) 257 $(REG_ROW x, Free-form syntax, ignores whitespace in pattern, 258 useful for formatting complex regular expressions. ) 259 ) 260 261 $(SECTION Unicode support) 262 263 This library provides full Level 1 support* according to 264 $(HTTP unicode.org/reports/tr18/, UTS 18). 
Specifically: 265 $(UL 266 $(LI 1.1 Hex notation via any of \uxxxx, \U00YYYYYY, \xZZ.) 267 $(LI 1.2 Unicode properties.) 268 $(LI 1.3 Character classes with set operations.) 269 $(LI 1.4 Word boundaries use the full set of "word" characters.) 270 $(LI 1.5 Using simple casefolding to match case 271 insensitively across the full range of codepoints.) 272 $(LI 1.6 Respecting line breaks as any of 273 \u000A | \u000B | \u000C | \u000D | \u0085 | \u2028 | \u2029 | \u000D\u000A.) 274 $(LI 1.7 Operating on codepoint level.) 275 ) 276 *With exception of point 1.1.1, as of yet, normalization of input 277 is expected to be enforced by user. 278 279 $(SECTION Replace format string) 280 281 A set of functions in this module that do the substitution rely 282 on a simple format to guide the process. In particular the table below 283 applies to the `format` argument of 284 $(LREF replaceFirst) and $(LREF replaceAll). 285 286 The format string can reference parts of match using the following notation. 287 $(REG_TABLE 288 $(REG_TITLE Format specifier, Replaced by ) 289 $(REG_ROW $(DOLLAR)$(AMP), the whole match. ) 290 $(REG_ROW $(DOLLAR)$(BACKTICK), part of input $(I preceding) the match. ) 291 $(REG_ROW $', part of input $(I following) the match. ) 292 $(REG_ROW $$, '$' character. ) 293 $(REG_ROW \c $(COMMA) where c is any character, the character c itself. ) 294 $(REG_ROW \\\\, '\\' character. ) 295 $(REG_ROW $(DOLLAR)1 .. $(DOLLAR)99, submatch number 1 to 99 respectively. ) 296 ) 297 298 $(SECTION Slicing and zero memory allocations orientation) 299 300 All matches returned by pattern matching functionality in this library 301 are slices of the original input. The notable exception is the `replace` 302 family of functions that generate a new string from the input. 303 304 In cases where producing the replacement is the ultimate goal 305 $(LREF replaceFirstInto) and $(LREF replaceAllInto) could come in handy 306 as functions that avoid allocations even for replacement. 

  Copyright: Copyright Dmitry Olshansky, 2011-

  License: $(HTTP boost.org/LICENSE_1_0.txt, Boost License 1.0).

  Authors: Dmitry Olshansky,

    API and utility constructs are modeled after the original `std.regex`
  by Walter Bright and Andrei Alexandrescu.

  Source: $(PHOBOSSRC std/regex/package.d)

Macros:
    REG_ROW = $(TR $(TD $(I $1 )) $(TD $+) )
    REG_TITLE = $(TR $(TD $(B $1)) $(TD $(B $2)) )
    REG_TABLE = <table border="1" cellspacing="0" cellpadding="5" > $0 </table>
    REG_START = <h3><div align="center"> $0 </div></h3>
    SECTION = <h3><a id="$1" href="#$1" class="anchor">$0</a></h3>
    S_LINK = <a href="#$1">$+</a>
 +/
module std.regex;

import std.range.primitives, std.traits;
import std.regex.internal.ir;
import std.typecons : Flag, Yes, No;

/++
    A `Regex` object holds a regular expression pattern in compiled form.

    Instances of this object are constructed via calls to `regex`.
    This is the intended form for caching and storage of frequently
    used regular expressions.

    Example:

    Test if this object doesn't contain any compiled pattern.
    ---
    Regex!char r;
    assert(r.empty);
    r = regex(""); // Note: "" is a valid regex pattern.
    assert(!r.empty);
    ---

    Getting a range of all the named captures in the regex.
    ----
    import std.range;
    import std.algorithm;

    auto re = regex(`(?P<name>\w+) = (?P<var>\d+)`);
    auto nc = re.namedCaptures;
    static assert(isRandomAccessRange!(typeof(nc)));
    assert(!nc.empty);
    assert(nc.length == 2);
    assert(nc.equal(["name", "var"]));
    assert(nc[0] == "name");
    assert(nc[1..$].equal(["var"]));
    ----
+/
public alias Regex(Char) = std.regex.internal.ir.Regex!(Char);

/++
    A `StaticRegex` is a `Regex` object that contains D code specially
    generated at compile-time to speed up matching.
370 371 No longer used, kept as alias to Regex for backwards compatibility. 372 +/ 373 public alias StaticRegex = Regex; 374 375 /++ 376 Compile regular expression pattern for the later execution. 377 Returns: `Regex` object that works on inputs having 378 the same character width as `pattern`. 379 380 Params: 381 pattern = A single regular expression to match. 382 patterns = An array of regular expression strings. 383 The resulting `Regex` object will match any expression; 384 use $(LREF whichPattern) to know which. 385 flags = The _attributes (g, i, m, s and x accepted) 386 387 Throws: `RegexException` if there were any errors during compilation. 388 +/ 389 @trusted public auto regex(S : C[], C)(const S[] patterns, const(char)[] flags="") 390 if (isSomeString!(S)) 391 { 392 import std.array : appender; 393 import std.functional : memoize; 394 enum cacheSize = 8; //TODO: invent nice interface to control regex caching 395 const(C)[] pat; 396 if (patterns.length > 1) 397 { 398 auto app = appender!S(); 399 foreach (i, p; patterns) 400 { 401 if (i != 0) 402 app.put("|"); 403 app.put("(?:"); 404 app.put(patterns[i]); 405 // terminator for the pattern 406 // to detect if the pattern unexpectedly ends 407 app.put("\\"); 408 app.put(cast(dchar)(privateUseStart+i)); 409 app.put(")"); 410 // another one to return correct whichPattern 411 // for all of potential alternatives in the patterns[i] 412 app.put("\\"); 413 app.put(cast(dchar)(privateUseStart+i)); 414 } 415 pat = app.data; 416 } 417 else 418 pat = patterns[0]; 419 420 if (__ctfe) 421 return regexImpl(pat, flags); 422 return memoize!(regexImpl!S, cacheSize)(pat, flags); 423 } 424 425 ///ditto 426 @trusted public auto regex(S)(S pattern, const(char)[] flags="") 427 if (isSomeString!(S)) 428 { 429 return regex([pattern], flags); 430 } 431 432 /// 433 @system unittest 434 { 435 void test(S)() 436 { 437 // multi-pattern regex example 438 S[] arr = [`([a-z]+):(\d+)`, `(\d+),\d+`]; 439 auto multi = regex(arr); // multi 
regex 440 S str = "abc:43 12,34"; 441 auto m = str.matchAll(multi); 442 assert(m.front.whichPattern == 1); 443 assert(m.front[1] == "abc"); 444 assert(m.front[2] == "43"); 445 m.popFront(); 446 assert(m.front.whichPattern == 2); 447 assert(m.front[1] == "12"); 448 } 449 450 import std.meta : AliasSeq; 451 static foreach (C; AliasSeq!(string, wstring, dstring)) 452 // Test with const array of patterns - see https://issues.dlang.org/show_bug.cgi?id=20301 453 static foreach (S; AliasSeq!(C, const C, immutable C)) 454 test!S(); 455 } 456 457 @system unittest 458 { 459 import std.conv : to; 460 import std.string : indexOf; 461 462 immutable pattern = "s+"; 463 auto regexString = to!string(regex(pattern, "U")); 464 assert(regexString.length <= pattern.length + 100, "String representation shouldn't be unreasonably bloated."); 465 assert(indexOf(regexString, "s+") >= 0, "String representation should include pattern."); 466 assert(indexOf(regexString, 'U') >= 0, "String representation should include flags."); 467 } 468 469 public auto regexImpl(S)(const S pattern, const(char)[] flags="") 470 if (isSomeString!(typeof(pattern))) 471 { 472 import std.regex.internal.parser : Parser, CodeGen; 473 auto parser = Parser!(Unqual!(typeof(pattern)), CodeGen)(pattern, flags); 474 auto r = parser.program; 475 return r; 476 } 477 478 479 private struct CTRegexWrapper(Char) 480 { 481 private immutable(Regex!Char)* re; 482 483 // allow code that expects mutable Regex to still work 484 // we stay "logically const" 485 @property @trusted ref getRe() const { return *cast(Regex!Char*) re; } 486 alias getRe this; 487 } 488 489 template ctRegexImpl(alias pattern, string flags="") 490 { 491 import std.regex.internal.backtracking, std.regex.internal.parser; 492 static immutable r = cast(immutable) regex(pattern, flags); 493 alias Char = BasicElementOf!(typeof(pattern)); 494 enum source = ctGenRegExCode(r); 495 @trusted pure bool func(BacktrackingMatcher!Char matcher) 496 { 497 debug(std_regex_ctr) 
pragma(msg, source); 498 cast(void) matcher; 499 mixin(source); 500 } 501 static immutable staticRe = 502 cast(immutable) r.withFactory(new CtfeFactory!(BacktrackingMatcher, Char, func)); 503 enum wrapper = CTRegexWrapper!Char(&staticRe); 504 } 505 506 @safe pure unittest 507 { 508 // test compat for logical const workaround 509 static void test(StaticRegex!char) 510 { 511 } 512 enum re = ctRegex!``; 513 test(re); 514 } 515 516 @safe pure unittest 517 { 518 auto re = ctRegex!`foo`; 519 assert(matchFirst("foo", re)); 520 521 // test reassignment 522 re = ctRegex!`bar`; 523 assert(matchFirst("bar", re)); 524 assert(!matchFirst("bar", ctRegex!`foo`)); 525 } 526 527 /++ 528 Compile regular expression using CTFE 529 and generate optimized native machine code for matching it. 530 531 Returns: StaticRegex object for faster matching. 532 533 Params: 534 pattern = Regular expression 535 flags = The _attributes (g, i, m, s and x accepted) 536 +/ 537 public enum ctRegex(alias pattern, string flags="") = ctRegexImpl!(pattern, flags).wrapper; 538 539 enum isRegexFor(RegEx, R) = is(immutable RegEx == immutable Regex!(BasicElementOf!R)) 540 || is(RegEx : const(Regex!(BasicElementOf!R))) 541 || is(immutable RegEx == immutable StaticRegex!(BasicElementOf!R)); 542 543 544 /++ 545 `Captures` object contains submatches captured during a call 546 to `match` or iteration over `RegexMatch` range. 547 548 First element of range is the whole match. 
+/
@trusted public struct Captures(R)
if (isSomeString!R)
{//@trusted because of union inside
    alias DataIndex = size_t;
    alias String = R;
    alias Store = SmallFixedArray!(Group!DataIndex, 3);
private:
    import std.conv : text;
    Store matches;               // one Group (begin/end offsets into _input) per submatch
    const(NamedGroup)[] _names;  // table consulted by the string-keyed opIndex
    R _input;                    // the input that every returned slice refers to
    int _nMatch;                 // 0 when nothing matched, otherwise the value reported by whichPattern
    uint _f, _b;                 // range window: front is group _f, back is group _b - 1

    // Creates a capture set for `n` groups over `input`; the window starts as [0, n).
    this(R input, uint n, const(NamedGroup)[] named)
    {
        _input = input;
        _names = named;
        matches = Store(n);
        _b = n;
        _f = 0;
    }

    // Same set-up, taking input, group count and name table from an existing RegexMatch.
    this(ref RegexMatch!R rmatch)
    {
        _input = rmatch._input;
        _names = rmatch._engine.pattern.dict;
        immutable n = rmatch._engine.pattern.ngroup;
        matches = Store(n);
        _b = n;
        _f = 0;
    }

    // Slice of the input covered by group `index`, or null when that group
    // tests false (see the "not matched will be null" unittest below).
    inout(R) getMatch(size_t index) inout
    {
        auto m = &matches[index];
        return *m ? _input[m.begin .. m.end] : null;
    }

public:
    ///Slice of input prior to the match (the whole input if there was no match).
    @property R pre()
    {
        return _nMatch == 0 ? _input[] : _input[0 .. matches[0].begin];
    }

    ///Slice of input immediately after the match (the whole input if there was no match).
    @property R post()
    {
        return _nMatch == 0 ? _input[] : _input[matches[0].end .. $];
    }

    ///Slice of matched portion of input. Must not be called on an empty match.
    @property R hit()
    {
        assert(_nMatch, "attempted to get hit of an empty match");
        return _input[matches[0].begin .. matches[0].end];
    }

    ///Range interface.
    @property R front()
    {
        assert(_nMatch, "attempted to get front of an empty match");
        return getMatch(_f);
    }

    ///ditto
    @property R back()
    {
        assert(_nMatch, "attempted to get back of an empty match");
        return getMatch(_b - 1);
    }

    ///ditto
    void popFront()
    {
        assert(!empty);
        ++_f;
    }

    ///ditto
    void popBack()
    {
        assert(!empty);
        --_b;
    }

    ///ditto
    @property bool empty() const { return _nMatch == 0 || _f >= _b; }

    ///ditto
    inout(R) opIndex()(size_t i) inout
    {
        // `i` is relative to the current front, so it shifts after popFront.
        assert(_f + i < _b,text("requested submatch number ", i," is out of range"));
        return getMatch(_f + i);
    }

    /++
        Explicit cast to bool.
        Useful as a shorthand for !(x.empty) in if and assert statements.

        ---
        import std.regex;

        assert(!matchFirst("nothing", "something"));
        ---
    +/

    @safe bool opCast(T:bool)() const nothrow { return _nMatch != 0; }

    /++
        One-based number of the pattern that matched (1 is the first pattern).
        Returns 0 on no match.
    +/

    @safe @property int whichPattern() const nothrow { return _nMatch; }

    ///
    @system unittest
    {
        import std.regex;
        assert(matchFirst("abc", "[0-9]+", "[a-z]+").whichPattern == 2);
    }

    /++
        Lookup named submatch.

        ---
        import std.regex;
        import std.range;

        auto c = matchFirst("a = 42;", regex(`(?P<var>\w+)\s*=\s*(?P<value>\d+);`));
        assert(c["var"] == "a");
        assert(c["value"] == "42");
        popFrontN(c, 2);
        //named groups are unaffected by range primitives
        assert(c["var"] =="a");
        assert(c.front == "42");
        ----
    +/
    R opIndex(String)(String i) /*const*/ //@@@BUG@@@
    if (isSomeString!String)
    {
        // The looked-up index is passed to getMatch as is, without the _f offset.
        size_t index = lookupNamedGroup(_names, i);
        return getMatch(index);
    }

    ///Number of submatches left in this range (0 if there was no match).
    @property size_t length() const { return _nMatch == 0 ? 0 : _b - _f; }

    ///A hook for compatibility with original std.regex.
    @property ref captures(){ return this; }
}

///
@system unittest
{
    import std.range.primitives : popFrontN;

    auto c = matchFirst("@abc#", regex(`(\w)(\w)(\w)`));
    assert(c.pre == "@"); // Part of input preceding match
    assert(c.post == "#"); // Immediately after match
    assert(c.hit == c[0] && c.hit == "abc"); // The whole match
    assert(c[2] == "b");
    assert(c.front == "abc");
    c.popFront();
    assert(c.front == "a");
    assert(c.back == "c");
    c.popBack();
    assert(c.back == "b");
    popFrontN(c, 2);
    assert(c.empty);

    assert(!matchFirst("nothing", "something"));

    // Captures that are not matched will be null.
    c = matchFirst("ac", regex(`a(b)?c`));
    assert(c);
    assert(!c[1]);
}

@system unittest
{
    Captures!string c;
    string s = "abc";
    assert(cast(bool)(c = matchFirst(s, regex("d")))
        || cast(bool)(c = matchFirst(s, regex("a"))));
}

// https://issues.dlang.org/show_bug.cgi?id=19979
@system unittest
{
    auto c = matchFirst("bad", regex(`(^)(not )?bad($)`));
    assert(c[0] && c[0].length == "bad".length);
    assert(c[1] && !c[1].length);
    assert(!c[2]);
    assert(c[3] && !c[3].length);
}

/++
    A regex engine state, as returned by `match` family of functions.

    Effectively it's a forward range of Captures!R, produced
    by lazily searching for matches in a given input.
+/
@trusted public struct RegexMatch(R)
if (isSomeString!R)
{
    import std.typecons : Rebindable;
private:
    alias Char = BasicElementOf!R;
    Matcher!Char _engine;                             // matcher; its ref count is adjusted through _factory
    Rebindable!(const MatcherFactory!Char) _factory;  // creates, duplicates and ref-counts _engine
    R _input;
    Captures!R _captures;                             // captures of the current match (what `front` returns)

    // Picks the factory (the program's own, or the default one), creates the
    // engine and immediately searches for the first match.
    this(RegEx)(R input, RegEx prog)
    {
        import std.exception : enforce; // NOTE(review): `enforce` looks unused in this constructor
        _input = input;
        if (prog.factory is null) _factory = defaultFactory!Char(prog);
        else _factory = prog.factory;
        _engine = _factory.create(prog, input);
        assert(_engine.refCount == 1);
        _captures = Captures!R(this);
        _captures.matches.mutate((slice) pure { _captures._nMatch = _engine.match(slice); });
    }

public:
    // A copy shares the engine, so the engine's reference count is incremented.
    this(this)
    {
        if (_engine) _factory.incRef(_engine);
    }

    // Gives up this object's reference to the engine.
    ~this()
    {
        if (_engine) _factory.decRef(_engine);
    }

    ///Shorthands for front.pre, front.post, front.hit.
    @property R pre()
    {
        return _captures.pre;
    }

    ///ditto
    @property R post()
    {
        return _captures.post;
    }

    ///ditto
    @property R hit()
    {
        return _captures.hit;
    }

    /++
        Functionality for processing subsequent matches of global regexes via range interface:
        ---
        import std.regex;
        auto m = matchAll("Hello, world!", regex(`\w+`));
        assert(m.front.hit == "Hello");
        m.popFront();
        assert(m.front.hit == "world");
        m.popFront();
        assert(m.empty);
        ---
    +/
    @property inout(Captures!R) front() inout
    {
        return _captures;
    }

    ///ditto
    void popFront()
    {
        import std.exception : enforce; // NOTE(review): `enforce` looks unused in this function
        // CoW - if refCount is not 1, we are aliased by somebody else
        if (_engine.refCount != 1)
        {
            // we create a new engine & abandon this reference
            auto old = _engine;
            _engine = _factory.dup(old, _input);
            _factory.decRef(old);
        }
        // Search for the next match, storing its groups and pattern number in _captures.
        _captures.matches.mutate((slice) { _captures._nMatch = _engine.match(slice); });
    }

    ///ditto
    auto save(){ return this; }

    ///Test if this match object is empty.
    @property bool empty() const { return _captures._nMatch == 0; }

    ///Same as !(x.empty), provided for convenience in conditional statements.
    T opCast(T:bool)(){ return !empty; }

    /// Same as .front, provided for compatibility with original std.regex.
    @property inout(Captures!R) captures() inout { return _captures; }
}

// Performs a single match of `prog` against `input` and returns the Captures.
// The most recently used engine is kept in function-level `static` variables,
// keyed by (pattern, flags), and is re-armed for the next call with the same key.
private auto matchOnceImpl(RegEx, R)(R input, const auto ref RegEx prog) @trusted
{
    alias Char = BasicElementOf!R;
    static struct Key
    {
        immutable(Char)[] pattern;
        uint flags;
    }
    static Key cacheKey = Key("", -1);
    static Matcher!Char cache;
    auto factory = prog.factory is null ? defaultFactory!Char(prog) : prog.factory;
    auto key = Key(prog.pattern, prog.flags);
    Matcher!Char engine;
    if (cacheKey == key)
    {
        // Same pattern and flags as the previous call: reuse the cached engine.
        engine = cache;
        engine.rearm(input);
    }
    else
    {
        engine = factory.create(prog, input);
        if (cache) factory.decRef(cache); // destroy cached engine *after* building a new one
        cache = engine;
        cacheKey = key;
    }
    auto captures = Captures!R(input, prog.ngroup, prog.dict);
    captures.matches.mutate((slice) pure { captures._nMatch = engine.match(slice); });
    return captures;
}

// matchOnce is constructed as a safe, pure wrapper over matchOnceImpl. It can be
// faked as pure because the static mutable variables are used to cache the key and
// character matcher. The technique used avoids delegates and GC.
private @safe auto matchOnce(RegEx, R)(R input, const auto ref RegEx prog) pure
{
    // Plain forwarding call; this is the impure function that gets re-typed below.
    static auto impureCall(R text, const ref RegEx compiled)
    {
        return matchOnceImpl(text, compiled);
    }

    // Invokes impureCall through a function pointer re-typed as pure.
    static @trusted auto pureCall(R text, const ref RegEx compiled)
    {
        return assumePureFunction(&impureCall)(text, compiled);
    }

    return pureCall(input, prog);
}

// Builds a RegexMatch over `input` with the global option forced on.
private auto matchMany(RegEx, R)(R input, auto ref RegEx re) @safe
{
    auto globalFlags = re.flags | RegexOption.global;
    return RegexMatch!R(input, re.withFlags(globalFlags));
}

@system unittest
{
    //sanity checks for new API
    auto compiled = regex("abc");
    assert(!"abc".matchOnce(compiled).empty);
    assert("abc".matchOnce(compiled)[0] == "abc");
}

// https://issues.dlang.org/show_bug.cgi?id=18135
@system unittest
{
    static struct MapResult { RegexMatch!string m; }
    MapResult m;
    m = MapResult();
    assert(m == m);
}

// True when `fun` can be called with just a Captures!R (a "mutator" style
// callback); otherwise it is treated as a callback that writes into a sink.
private enum isReplaceFunctor(alias fun, R) =
    __traits(compiles, (Captures!R caps) { fun(caps); });

// the lowest level - just stuff replacements into the sink
private @trusted void replaceCapturesInto(alias output, Sink, R, T)
        (ref Sink sink, R input, T captures)
if (isOutputRange!(Sink, dchar) && isSomeString!R)
{
    if (!captures.empty)
    {
        // text before the match, then the replacement, then the rest
        sink.put(captures.pre);
        // a hack to get around bogus errors, should be simply output(captures, sink)
        // "is a nested function and cannot be accessed from"
        static if (isReplaceFunctor!(output, R))
            sink.put(output(captures)); //"mutator" type of function
        else
            output(captures, sink); //"output" type of function
        sink.put(captures.post);
    }
    else
    {
        // nothing matched: the input passes through unchanged
        sink.put(input);
    }
}

// ditto for a range of captures
private void replaceMatchesInto(alias output, Sink, R, T)
        (ref Sink sink, R input, T matches)
if (isOutputRange!(Sink, dchar) && isSomeString!R)
{
    // index into input just past the previous match
    size_t consumed = 0;
    foreach (m; matches)
    {
        // the gap between the previous match and this one
        sink.put(m.pre[consumed .. $]);
        // same hack, see replaceCapturesInto
        static if (isReplaceFunctor!(output, R))
            sink.put(output(m)); //"mutator" type of function
        else
            output(m, sink); //"output" type of function
        consumed = m.pre.length + m.hit.length;
    }
    // whatever follows the last match
    sink.put(input[consumed .. $]);
}

// a general skeleton of replaceFirst
private R replaceFirstWith(alias output, R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    import std.array : appender;
    auto found = matchFirst(input, re);
    // no match: hand back the input itself, nothing is allocated
    if (found.empty)
        return input;
    auto result = appender!(R)();
    replaceCapturesInto!output(result, input, found);
    return result.data;
}

// ditto for replaceAll
// the method parameter allows old API to ride on the back of the new one
private R replaceAllWith(alias output,
        alias method=matchAll, R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    import std.array : appender;
    auto found = method(input, re); //inout(C)[] fails
    // no match: hand back the input itself, nothing is allocated
    if (found.empty)
        return input;
    auto result = appender!(R)();
    replaceMatchesInto!output(result, input, found);
    return result.data;
}


/++
    Start matching `input` to regex pattern `re`,
    using Thompson NFA matching scheme.

    The use of this function is $(RED discouraged) - use either of
    $(LREF matchAll) or $(LREF matchFirst).

    Delegating the kind of operation
    to "g" flag is soon to be phased out along with the
    ability to choose the exact matching scheme. The choice of
    matching scheme to use depends highly on the pattern kind and
    can be done automatically on a case-by-case basis.

    Returns: a `RegexMatch` object holding engine state after first match.
+/

public auto match(R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx,R))
{
    alias Matcher = RegexMatch!(Unqual!R);
    return Matcher(input, re);
}

///ditto
public auto match(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
    alias Matcher = RegexMatch!(Unqual!R);
    return Matcher(input, regex(re));
}

/++
    Search `input` for the first (leftmost) slice that matches
    the pattern `re`. The most suitable regular expression engine
    is selected based on the properties of the pattern.

    The `re` parameter may be any of the following:
    $(UL
      $(LI Plain string(s), in which case it's compiled to bytecode before matching. )
      $(LI Regex!char (wchar/dchar) that contains a pattern in the form of
        compiled bytecode. )
      $(LI StaticRegex!char (wchar/dchar) that contains a pattern in the form of
        compiled native machine code. )
    )

    Returns:
    $(LREF Captures) holding the extent of the match along with all submatches
    if a match was found, otherwise an empty $(LREF Captures) object.
+/
public auto matchFirst(R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    return input.matchOnce(re);
}

///ditto
public auto matchFirst(R, String)(R input, String re)
if (isSomeString!R && isSomeString!String)
{
    return input.matchOnce(regex(re));
}

///ditto
public auto matchFirst(R, String)(R input, String[] re...)
if (isSomeString!R && isSomeString!String)
{
    return input.matchOnce(regex(re));
}

/++
    Initiate a search for all non-overlapping matches to the pattern `re`
    in the given `input`. The result is a lazy range of matches generated
    as they are encountered in the input going left to right.
1065 1066 This function picks the most suitable regular expression engine 1067 depending on the pattern properties. 1068 1069 `re` parameter can be one of three types: 1070 $(UL 1071 $(LI Plain string(s), in which case it's compiled to bytecode before matching. ) 1072 $(LI Regex!char (wchar/dchar) that contains a pattern in the form of 1073 compiled bytecode. ) 1074 $(LI StaticRegex!char (wchar/dchar) that contains a pattern in the form of 1075 compiled native machine code. ) 1076 ) 1077 1078 Returns: 1079 $(LREF RegexMatch) object that represents matcher state 1080 after the first match was found or an empty one if not present. 1081 +/ 1082 public auto matchAll(R, RegEx)(R input, RegEx re) 1083 if (isSomeString!R && isRegexFor!(RegEx, R)) 1084 { 1085 return matchMany(input, re); 1086 } 1087 1088 ///ditto 1089 public auto matchAll(R, String)(R input, String re) 1090 if (isSomeString!R && isSomeString!String) 1091 { 1092 return matchMany(input, regex(re)); 1093 } 1094 1095 ///ditto 1096 public auto matchAll(R, String)(R input, String[] re...) 
1097 if (isSomeString!R && isSomeString!String) 1098 { 1099 return matchMany(input, regex(re)); 1100 } 1101 1102 // another set of tests just to cover the new API 1103 @system unittest 1104 { 1105 import std.algorithm.comparison : equal; 1106 import std.algorithm.iteration : map; 1107 import std.conv : to; 1108 1109 static foreach (String; AliasSeq!(string, wstring, const(dchar)[])) 1110 {{ 1111 auto str1 = "blah-bleh".to!String(); 1112 auto pat1 = "bl[ae]h".to!String(); 1113 auto mf = matchFirst(str1, pat1); 1114 assert(mf.equal(["blah".to!String()])); 1115 auto mAll = matchAll(str1, pat1); 1116 assert(mAll.equal!((a,b) => a.equal(b)) 1117 ([["blah".to!String()], ["bleh".to!String()]])); 1118 1119 auto str2 = "1/03/12 - 3/03/12".to!String(); 1120 auto pat2 = regex([r"(\d+)/(\d+)/(\d+)".to!String(), "abc".to!String]); 1121 auto mf2 = matchFirst(str2, pat2); 1122 assert(mf2.equal(["1/03/12", "1", "03", "12"].map!(to!String)())); 1123 auto mAll2 = matchAll(str2, pat2); 1124 assert(mAll2.front.equal(mf2)); 1125 mAll2.popFront(); 1126 assert(mAll2.front.equal(["3/03/12", "3", "03", "12"].map!(to!String)())); 1127 mf2.popFrontN(3); 1128 assert(mf2.equal(["12".to!String()])); 1129 1130 auto ctPat = ctRegex!(`(?P<Quot>\d+)/(?P<Denom>\d+)`.to!String()); 1131 auto str = "2 + 34/56 - 6/1".to!String(); 1132 auto cmf = matchFirst(str, ctPat); 1133 assert(cmf.equal(["34/56", "34", "56"].map!(to!String)())); 1134 assert(cmf["Quot"] == "34".to!String()); 1135 assert(cmf["Denom"] == "56".to!String()); 1136 1137 auto cmAll = matchAll(str, ctPat); 1138 assert(cmAll.front.equal(cmf)); 1139 cmAll.popFront(); 1140 assert(cmAll.front.equal(["6/1", "6", "1"].map!(to!String)())); 1141 }} 1142 } 1143 1144 /++ 1145 Start matching of `input` to regex pattern `re`, 1146 using traditional $(LINK2 https://en.wikipedia.org/wiki/Backtracking, 1147 backtracking) matching scheme. 1148 1149 The use of this function is $(RED discouraged) - use either of 1150 $(LREF matchAll) or $(LREF matchFirst). 
1151 1152 Delegating the kind of operation 1153 to "g" flag is soon to be phased out along with the 1154 ability to choose the exact matching scheme. The choice of 1155 matching scheme to use depends highly on the pattern kind and 1156 can done automatically on case by case basis. 1157 1158 Returns: a `RegexMatch` object holding engine 1159 state after first match. 1160 1161 +/ 1162 public auto bmatch(R, RegEx)(R input, RegEx re) 1163 if (isSomeString!R && isRegexFor!(RegEx, R)) 1164 { 1165 return RegexMatch!(Unqual!(typeof(input)))(input, re); 1166 } 1167 1168 ///ditto 1169 public auto bmatch(R, String)(R input, String re) 1170 if (isSomeString!R && isSomeString!String) 1171 { 1172 return RegexMatch!(Unqual!(typeof(input)))(input, regex(re)); 1173 } 1174 1175 // produces replacement string from format using captures for substitution 1176 package void replaceFmt(R, Capt, OutR) 1177 (R format, Capt captures, OutR sink, bool ignoreBadSubs = false) 1178 if (isOutputRange!(OutR, ElementEncodingType!R[]) && 1179 isOutputRange!(OutR, ElementEncodingType!(Capt.String)[])) 1180 { 1181 import std.algorithm.searching : find; 1182 import std.ascii : isDigit, isAlpha; 1183 import std.conv : text, parse; 1184 import std.exception : enforce; 1185 enum State { Normal, Dollar } 1186 auto state = State.Normal; 1187 size_t offset; 1188 L_Replace_Loop: 1189 while (!format.empty) 1190 final switch (state) 1191 { 1192 case State.Normal: 1193 for (offset = 0; offset < format.length; offset++)//no decoding 1194 { 1195 if (format[offset] == '$') 1196 { 1197 state = State.Dollar; 1198 sink.put(format[0 .. offset]); 1199 format = format[offset+1 .. $];//ditto 1200 continue L_Replace_Loop; 1201 } 1202 } 1203 sink.put(format[0 .. offset]); 1204 format = format[offset .. 
$]; 1205 break; 1206 case State.Dollar: 1207 if (isDigit(format[0])) 1208 { 1209 uint digit = parse!uint(format); 1210 enforce(ignoreBadSubs || digit < captures.length, text("invalid submatch number ", digit)); 1211 if (digit < captures.length) 1212 sink.put(captures[digit]); 1213 } 1214 else if (format[0] == '{') 1215 { 1216 auto x = find!(a => !isAlpha(a))(format[1..$]); 1217 enforce(!x.empty && x[0] == '}', "no matching '}' in replacement format"); 1218 auto name = format[1 .. $ - x.length]; 1219 format = x[1..$]; 1220 enforce(!name.empty, "invalid name in ${...} replacement format"); 1221 sink.put(captures[name]); 1222 } 1223 else if (format[0] == '&') 1224 { 1225 sink.put(captures[0]); 1226 format = format[1 .. $]; 1227 } 1228 else if (format[0] == '`') 1229 { 1230 sink.put(captures.pre); 1231 format = format[1 .. $]; 1232 } 1233 else if (format[0] == '\'') 1234 { 1235 sink.put(captures.post); 1236 format = format[1 .. $]; 1237 } 1238 else if (format[0] == '$') 1239 { 1240 sink.put(format[0 .. 1]); 1241 format = format[1 .. $]; 1242 } 1243 state = State.Normal; 1244 break; 1245 } 1246 enforce(state == State.Normal, "invalid format string in regex replace"); 1247 } 1248 1249 /++ 1250 Construct a new string from `input` by replacing the first match with 1251 a string generated from it according to the `format` specifier. 1252 1253 To replace all matches use $(LREF replaceAll). 1254 1255 Params: 1256 input = string to search 1257 re = compiled regular expression to use 1258 format = _format string to generate replacements from, 1259 see $(S_LINK Replace _format string, the _format string). 1260 1261 Returns: 1262 A string of the same type with the first match (if any) replaced. 1263 If no match is found returns the input string itself. 
+/
public R replaceFirst(R, C, RegEx)(R input, RegEx re, const(C)[] format)
if (isSomeString!R && is(C : dchar) && isRegexFor!(RegEx, R))
{
    return input.replaceFirstWith!((caps, output) => replaceFmt(format, caps, output))(re);
}

///
@system unittest
{
    auto bracketed = replaceFirst("noon", regex("n"), "[$&]");
    assert(bracketed == "[n]oon");
}

/++
    This is a general replacement tool that constructs a new string by replacing
    matches of pattern `re` in the `input`. Unlike the other overload
    there is no format string, instead captures are passed
    to a user-defined functor `fun` that returns a new string
    to use as replacement.

    This version replaces the first match in `input`,
    see $(LREF replaceAll) to replace all of the matches.

    Returns:
    A new string of the same type as `input` with the first match
    replaced by the return value of `fun`. If no match is found
    returns the `input` itself.
+/
public R replaceFirst(alias fun, R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    return input.replaceFirstWith!((caps, output) => output.put(fun(caps)))(re);
}

///
@system unittest
{
    import std.conv : to;
    string list = "#21 out of 46";
    // increment the first number found
    string newList = replaceFirst!(cap => (cap.hit.to!int + 1).to!string)
        (list, regex(`[0-9]+`));
    assert(newList == "#22 out of 46");
}

/++
    A variation on $(LREF replaceFirst) that instead of allocating a new string
    on each call outputs the result piece-wise to the `sink`. In particular
    this enables efficient construction of a final output incrementally.

    Like in $(LREF replaceFirst) family of functions there is an overload
    for the substitution guided by the `format` string
    and the one with the user defined callback.
+/
public @trusted void replaceFirstInto(Sink, R, C, RegEx)
        (ref Sink sink, R input, RegEx re, const(C)[] format)
if (isOutputRange!(Sink, dchar) && isSomeString!R
    && is(C : dchar) && isRegexFor!(RegEx, R))
{
    replaceCapturesInto!((m, sink) => replaceFmt(format, m, sink))
        (sink, input, matchFirst(input, re));
}

///ditto
public @trusted void replaceFirstInto(alias fun, Sink, R, RegEx)
    (auto ref Sink sink, R input, RegEx re)
if (isOutputRange!(Sink, dchar) && isSomeString!R && isRegexFor!(RegEx, R))
{
    // `auto ref`: an lvalue sink is written to in place, exactly like in the
    // format-string overload, while rvalue sinks are still accepted.
    // With a plain by-value parameter the output ended up in a copy, which
    // loses it for sinks that do not share their storage with their copies.
    replaceCapturesInto!fun(sink, input, matchFirst(input, re));
}

///
@system unittest
{
    import std.array;
    string m1 = "first message\n";
    string m2 = "second message\n";
    auto result = appender!string();
    replaceFirstInto(result, m1, regex(`([a-z]+) message`), "$1");
    //equivalent of the above with user-defined callback
    replaceFirstInto!(cap=>cap[1])(result, m2, regex(`([a-z]+) message`));
    assert(result.data == "first\nsecond\n");
}

// the callback overload must write into the caller's sink, not into a copy
@system unittest
{
    import std.array : Appender;
    Appender!string sink; // default-initialized, no buffer allocated yet
    replaceFirstInto!(cap => cap[1])(sink, "first message", regex(`([a-z]+) message`));
    assert(sink.data == "first");
}

//examples for replaceFirst
@system unittest
{
    import std.conv;
    string list = "#21 out of 46";
    string newList = replaceFirst!(cap => to!string(to!int(cap.hit)+1))
        (list, regex(`[0-9]+`));
    assert(newList == "#22 out of 46");
    import std.array;
    string m1 = "first message\n";
    string m2 = "second message\n";
    auto result = appender!string();
    replaceFirstInto(result, m1, regex(`([a-z]+) message`), "$1");
    //equivalent of the above with user-defined callback
    replaceFirstInto!(cap=>cap[1])(result, m2, regex(`([a-z]+) message`));
    assert(result.data == "first\nsecond\n");
}

/++
    Construct a new string from `input` by replacing all of the
    fragments that match a pattern `re` with a string generated
    from the match according to the `format` specifier.
1369 1370 To replace only the first match use $(LREF replaceFirst). 1371 1372 Params: 1373 input = string to search 1374 re = compiled regular expression to use 1375 format = _format string to generate replacements from, 1376 see $(S_LINK Replace _format string, the _format string). 1377 1378 Returns: 1379 A string of the same type as `input` with the all 1380 of the matches (if any) replaced. 1381 If no match is found returns the input string itself. 1382 +/ 1383 public @trusted R replaceAll(R, C, RegEx)(R input, RegEx re, const(C)[] format) 1384 if (isSomeString!R && is(C : dchar) && isRegexFor!(RegEx, R)) 1385 { 1386 return replaceAllWith!((m, sink) => replaceFmt(format, m, sink))(input, re); 1387 } 1388 1389 /// 1390 @system unittest 1391 { 1392 // insert comma as thousands delimiter 1393 auto re = regex(r"(?<=\d)(?=(\d\d\d)+\b)","g"); 1394 assert(replaceAll("12000 + 42100 = 54100", re, ",") == "12,000 + 42,100 = 54,100"); 1395 } 1396 1397 /++ 1398 This is a general replacement tool that construct a new string by replacing 1399 matches of pattern `re` in the `input`. Unlike the other overload 1400 there is no format string instead captures are passed to 1401 to a user-defined functor `fun` that returns a new string 1402 to use as replacement. 1403 1404 This version replaces all of the matches found in `input`, 1405 see $(LREF replaceFirst) to replace the first match only. 1406 1407 Returns: 1408 A new string of the same type as `input` with all matches 1409 replaced by return values of `fun`. If no matches found 1410 returns the `input` itself. 
1411 1412 Params: 1413 input = string to search 1414 re = compiled regular expression 1415 fun = delegate to use 1416 +/ 1417 public @trusted R replaceAll(alias fun, R, RegEx)(R input, RegEx re) 1418 if (isSomeString!R && isRegexFor!(RegEx, R)) 1419 { 1420 return replaceAllWith!((m, sink) => sink.put(fun(m)))(input, re); 1421 } 1422 1423 /// 1424 @system unittest 1425 { 1426 string baz(Captures!(string) m) 1427 { 1428 import std.string : toUpper; 1429 return toUpper(m.hit); 1430 } 1431 // Capitalize the letters 'a' and 'r': 1432 auto s = replaceAll!(baz)("Strap a rocket engine on a chicken.", 1433 regex("[ar]")); 1434 assert(s == "StRAp A Rocket engine on A chicken."); 1435 } 1436 1437 /++ 1438 A variation on $(LREF replaceAll) that instead of allocating a new string 1439 on each call outputs the result piece-wise to the `sink`. In particular 1440 this enables efficient construction of a final output incrementally. 1441 1442 As with $(LREF replaceAll) there are 2 overloads - one with a format string, 1443 the other one with a user defined functor. 
+/
public @trusted void replaceAllInto(Sink, R, C, RegEx)
        (Sink sink, R input, RegEx re, const(C)[] format)
if (isOutputRange!(Sink, dchar) && isSomeString!R
    && is(C : dchar) && isRegexFor!(RegEx, R))
{
    replaceMatchesInto!((m, sink) => replaceFmt(format, m, sink))
        (sink, input, matchAll(input, re));
}

///ditto
public @trusted void replaceAllInto(alias fun, Sink, R, RegEx)
        (Sink sink, R input, RegEx re)
if (isOutputRange!(Sink, dchar) && isSomeString!R && isRegexFor!(RegEx, R))
{
    replaceMatchesInto!fun(sink, input, matchAll(input, re));
}

///
@system unittest
{
    // insert comma as thousands delimiter in fifty randomly produced big numbers
    import std.array, std.conv, std.random, std.range;
    static re = regex(`(?<=\d)(?=(\d\d\d)+\b)`, "g");
    auto sink = appender!(char [])();
    enum ulong min = 10UL ^^ 10, max = 10UL ^^ 19;
    foreach (i; 0 .. 50)
    {
        sink.clear();
        replaceAllInto(sink, text(uniform(min, max)), re, ",");
        // walking back from the end, every 4th character must be a comma
        foreach (pos; iota(sink.data.length - 4, 0, -4))
            assert(sink.data[pos] == ',');
    }
}

// exercise all of the replace APIs
@system unittest
{
    import std.array : appender;
    import std.conv;
    // try and check first/all simple substitution
    static foreach (S; AliasSeq!(string, wstring, dstring, char[], wchar[], dchar[]))
    {{
        S s1 = "curt trial".to!S();
        S s2 = "round dome".to!S();
        S t1F = "court trial".to!S();
        S t2F = "hound dome".to!S();
        S t1A = "court trial".to!S();
        S t2A = "hound home".to!S();
        auto re1 = regex("curt".to!S());
        auto re2 = regex("[dr]o".to!S());

        assert(replaceFirst(s1, re1, "court") == t1F);
        assert(replaceFirst(s2, re2, "ho") == t2F);
        assert(replaceAll(s1, re1, "court") == t1A);
        assert(replaceAll(s2, re2, "ho") == t2A);

        auto rep1 = replaceFirst!(cap => cap[0][0]~"o".to!S()~cap[0][1..$])(s1, re1);
        assert(rep1 == t1F);
        assert(replaceFirst!(cap => "ho".to!S())(s2, re2) == t2F);
        auto rep1A = replaceAll!(cap => cap[0][0]~"o".to!S()~cap[0][1..$])(s1, re1);
        assert(rep1A == t1A);
        assert(replaceAll!(cap => "ho".to!S())(s2, re2) == t2A);

        // the sink accumulates the output of consecutive calls
        auto sink = appender!S();
        replaceFirstInto(sink, s1, re1, "court");
        assert(sink.data == t1F);
        replaceFirstInto(sink, s2, re2, "ho");
        assert(sink.data == t1F~t2F);
        replaceAllInto(sink, s1, re1, "court");
        assert(sink.data == t1F~t2F~t1A);
        replaceAllInto(sink, s2, re2, "ho");
        assert(sink.data == t1F~t2F~t1A~t2A);
    }}
}

/++
    Old API for replacement, operation depends on flags of pattern `re`.
    With "g" flag it performs the equivalent of $(LREF replaceAll) otherwise it
    works the same as $(LREF replaceFirst).

    The use of this function is $(RED discouraged), please use $(LREF replaceAll)
    or $(LREF replaceFirst) explicitly.

    Params:
    scheme = function used to produce the matches, $(LREF match) by default
    input = string to search
    re = compiled regular expression to use
    format = _format string to generate replacements from
+/
public R replace(alias scheme = match, R, C, RegEx)(R input, RegEx re, const(C)[] format)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    // forward the requested scheme; it used to be ignored in favour of `match`
    return replaceAllWith!((m, sink) => replaceFmt(format, m, sink), scheme)(input, re);
}

///ditto
public R replace(alias fun, R, RegEx)(R input, RegEx re)
if (isSomeString!R && isRegexFor!(RegEx, R))
{
    return replaceAllWith!(fun, match)(input, re);
}

// the old API follows the "g" flag by default and honours an explicit scheme
@system unittest
{
    assert(replace("noon", regex("n"), "[$&]") == "[n]oon");
    assert(replace("noon", regex("n", "g"), "[$&]") == "[n]oo[n]");
    // a scheme that always matches globally overrides the flag-driven default
    assert(replace!((s, re) => matchAll(s, re))("noon", regex("n"), "[$&]") == "[n]oo[n]");
}

/**
Splits a string `r` using a regular expression `pat` as a separator.

Params:
    keepSeparators = flag to specify if the matches should be in the resulting range
    r = the string to split
    pat = the pattern to split on
Returns:
    A lazy range of strings
*/
public struct Splitter(Flag!"keepSeparators" keepSeparators = No.keepSeparators, Range, alias RegEx = Regex)
if (isSomeString!Range && isRegexFor!(RegEx, Range))
{
private:
    Range _input;
    // start of the current piece; a value past _input.length marks the range as empty
    size_t _offset;
    alias Rx = typeof(match(Range.init,RegEx.init));
    // iterates over the separators
    Rx _match;

    // true while the current front is a separator rather than the text between two of them
    static if (keepSeparators) bool onMatch = false;

    @trusted this(Range input, RegEx separator)
    {//@@@BUG@@@ generated opAssign of RegexMatch is not @trusted
        _input = input;
        // all separators are needed, so force the global flag
        const re = separator.withFlags(separator.flags | RegexOption.global);
        if (_input.empty)
        {
            //there is nothing to match at all, make _offset > 0
            _offset = 1;
        }
        else
        {
            _match = Rx(_input, re);

            // nothing precedes the first separator: start on the separator itself
            static if (keepSeparators)
                if (_match.pre.empty)
                    popFront();
        }
    }

public:
    auto ref opSlice()
    {
        return this.save;
    }

    ///Forward range primitives.
    @property Range front()
    {
        import std.algorithm.comparison : min;

        assert(!empty && _offset <= _match.pre.length
                && _match.pre.length <= _input.length);

        static if (keepSeparators)
        {
            if (!onMatch)
                return _input[_offset .. min($, _match.pre.length)];
            else
                return _match.hit();
        }
        else
        {
            return _input[_offset .. min($, _match.pre.length)];
        }
    }

    ///ditto
    @property bool empty()
    {
        static if (keepSeparators)
            return _offset >= _input.length;
        else
            return _offset > _input.length;
    }

    ///ditto
    void popFront()
    {
        assert(!empty);
        if (_match.empty)
        {
            //No more separators, work is done here
            _offset = _input.length + 1;
        }
        else
        {
            static if (keepSeparators)
            {
                if (!onMatch)
                {
                    //move to the start of the separator, it becomes the next front
                    _offset = _match.pre.length;
                }
                else
                {
                    //skip past the separator and look for the next one
                    _offset += _match.hit.length;
                    _match.popFront();
                }

                onMatch = !onMatch;
            }
            else
            {
                //skip past the separator
                _offset = _match.pre.length + _match.hit.length;
                _match.popFront();
            }
        }
    }

    ///ditto
    @property auto save()
    {
        return this;
    }
}

/// ditto
public Splitter!(keepSeparators, Range, RegEx) splitter(
    Flag!"keepSeparators" keepSeparators = No.keepSeparators, Range, RegEx)(Range r, RegEx pat)
if (
    is(BasicElementOf!Range : dchar) && isRegexFor!(RegEx, Range))
{
    return Splitter!(keepSeparators, Range, RegEx)(r, pat);
}

///
@system unittest
{
    import std.algorithm.comparison : equal;
    auto s1 = ", abc, de,  fg, hi, ";
    assert(equal(splitter(s1, regex(", *")),
        ["", "abc", "de", "fg", "hi", ""]));
}

/// Split on a pattern, but keep the matches in the resulting range
@system unittest
{
    import std.algorithm.comparison : equal;
    import std.typecons : Yes;

    auto pattern = regex(`([\.,])`);

    assert("2003.04.05"
        .splitter!(Yes.keepSeparators)(pattern)
        .equal(["2003", ".", "04", ".", "05"]));

    assert(",1,2,3"
        .splitter!(Yes.keepSeparators)(pattern)
        .equal([",", "1", ",", "2", ",", "3"]));
}

///An eager version of `splitter` that creates an array with split slices of `input`.
public @trusted String[] split(String, RegEx)(String input, RegEx rx)
if (isSomeString!String && isRegexFor!(RegEx, String))
{
    import std.array : appender;
    auto a = appender!(String[])();
    foreach (e; splitter(input, rx))
        a.put(e);
    return a.data;
}

///Exception object thrown in case of errors during regex compilation.
public alias RegexException = std.regex.internal.ir.RegexException;

/++
  A range that lazily produces a string output escaped
  to be used inside of a regular expression.
+/
auto escaper(Range)(Range r)
{
    import std.algorithm.searching : find;
    static immutable escapables = [Escapables];
    static struct Escaper // template to deduce attributes
    {
        Range r;
        // true when a backslash still has to be produced in front of r.front
        bool escaped;

        @property ElementType!Range front(){
          if (escaped)
              return '\\';
          else
              return r.front;
        }

        @property bool empty(){ return r.empty; }

        void popFront(){
          // the backslash was consumed, the escaped character itself is next
          if (escaped) escaped = false;
          else
          {
              r.popFront();
              if (!r.empty && !escapables.find(r.front).empty)
                  escaped = true;
          }
        }

        @property auto save(){ return Escaper(r.save, escaped); }
    }

    // the very first character may need escaping as well
    bool escaped = !r.empty && !escapables.find(r.front).empty;
    return Escaper(r, escaped);
}

///
@system unittest
{
    import std.algorithm.comparison;
    import std.regex;
    string s = `This is {unfriendly} to *regex*`;
    assert(s.escaper.equal(`This is \{unfriendly\} to \*regex\*`));
}

@system unittest
{
    import std.algorithm.comparison;
    import std.conv;
    static foreach (S; AliasSeq!(string, wstring, dstring))
    {{
        auto s = "^".to!S;
        assert(s.escaper.equal(`\^`));
        // empty input of every string type, not just `string`
        auto s2 = "".to!S;
        assert(s2.escaper.equal(""));
    }}
}

@system unittest
{
    assert("ab".matchFirst(regex(`a?b?`)).hit == "ab");
    assert("ab".matchFirst(regex(`a??b?`)).hit == "");
}