std.uri source code

1 // Written in the D programming language.
2 
3 /**
4  * Encode and decode Uniform Resource Identifiers (URIs).
5  * URIs are used in internet transfer protocols.
6  * Valid URI characters consist of letters, digits,
7  * and the characters $(B ;/?:@&amp;=+$,-_.!~*'())
8  * Reserved URI characters are $(B ;/?:@&amp;=+$,)
9  * Escape sequences consist of $(B %) followed by two hex digits.
10  *
11  * See_Also:
12  *  $(LINK2 https://www.ietf.org/rfc/rfc3986.txt, RFC 3986)<br>
13  *  $(LINK2 http://en.wikipedia.org/wiki/Uniform_resource_identifier, Wikipedia)
14  * Copyright: Copyright The D Language Foundation 2000 - 2009.
15  * License:   $(HTTP www.boost.org/LICENSE_1_0.txt, Boost License 1.0).
16  * Authors:   $(HTTP digitalmars.com, Walter Bright)
17  * Source:    $(PHOBOSSRC std/uri.d)
18  */
19 /*          Copyright The D Language Foundation 2000 - 2009.
20  * Distributed under the Boost Software License, Version 1.0.
21  *    (See accompanying file LICENSE_1_0.txt or copy at
22  *          http://www.boost.org/LICENSE_1_0.txt)
23  */
24 module std.uri;
25 
26 //debug=uri;        // uncomment to turn on debugging writefln's
27 debug(uri) import std.stdio;
28 import std.traits : isSomeChar;
29 
/**
 * Exception thrown when a URI cannot be encoded or decoded,
 * for example because of a malformed escape sequence.
 */
class URIException : Exception
{
    /// Builds the exception from a message, an optional source location and an optional chained cause.
    this(string msg, string file = __FILE__, size_t line = __LINE__,
         Throwable next = null) @nogc @safe pure nothrow
    {
        super(msg, file, line, next);
    }

    /// Builds the exception from a message and a chained cause, with an optional source location.
    this(string msg, Throwable next, string file = __FILE__,
         size_t line = __LINE__) @nogc @safe pure nothrow
    {
        super(msg, next, file, line);
    }
}
38 
///
@safe unittest
{
    import std.exception : assertThrown;

    // 0xAB is a UTF-8 continuation byte, so it cannot start a sequence
    assertThrown!URIException(decode("%ab"));
}
45 
// Character-class bits stored per character in `uri_flags`.
// They are OR-ed together to form the masks passed to URI_Encode/URI_Decode.
private enum
{
    URI_Alpha    = 1 << 0,  // letters
    URI_Reserved = 1 << 1,  // ;/?:@&=+$,
    URI_Mark     = 1 << 2,  // -_.!~*'()
    URI_Digit    = 1 << 3,  // decimal digits
    URI_Hash     = 1 << 4,  // '#'
}
54 
// Upper-case hexadecimal digit for each nibble value 0 .. 15.
private immutable char[16] hex2ascii = [
    '0', '1', '2', '3', '4', '5', '6', '7',
    '8', '9', 'A', 'B', 'C', 'D', 'E', 'F',
];
56 
// Character-class table for the ASCII range, indexed by character.
// Each entry is a combination of the URI_* bits; it is computed at compile time.
private immutable ubyte[128] uri_flags = () {
    ubyte[128] table;

    foreach (offset; 0 .. 26)
    {
        table['A' + offset] |= URI_Alpha;
        table['a' + offset] |= URI_Alpha;
    }
    foreach (digit; "0123456789")
        table[digit] |= URI_Digit;
    foreach (reserved; ";/?:@&=+$,")
        table[reserved] |= URI_Reserved;
    foreach (mark; "-_.!~*'()")
        table[mark] |= URI_Mark;
    table['#'] |= URI_Hash;

    return table;
}();
74 
/*
 * Percent-encodes the UTF-32 string `str`.
 *
 * Params:
 *      str          = text to encode
 *      unescapedSet = bitwise OR of URI_* flags; an ASCII character whose
 *                     `uri_flags` entry intersects this mask is copied
 *                     unchanged
 *
 * Returns:
 *      A newly allocated string in which every other character is replaced by
 *      the `%XX` escapes (upper-case hex digits) of its UTF-8 bytes.
 *
 * Throws:
 *      URIException if `str` contains a value that is not a Unicode scalar
 *      value, i.e. a surrogate (U+D800 .. U+DFFF) or a value above U+10FFFF.
 *      URI_Decode rejects the escapes of such values, so producing them would
 *      yield output that cannot be decoded again.
 */
private string URI_Encode(dstring str, uint unescapedSet) @safe pure
{
    // Short results are assembled in this stack buffer; once it overflows the
    // data is moved to a GC-allocated buffer that grows geometrically.
    char[50] buffer = void;
    char[] R = buffer[];
    size_t Rlen = 0;    // size_t, so the count cannot wrap on large inputs

    foreach (immutable dchar C; str)
    {
        // if (C in unescapedSet)
        if (C < uri_flags.length && (uri_flags[C] & unescapedSet) != 0)
        {
            if (Rlen == R.length)
            {
                auto R2 = new char[2 * R.length];
                R2[0 .. Rlen] = R[0 .. Rlen];
                R = R2;
            }
            R[Rlen++] = cast(char) C;
            continue;
        }

        if (C > 0x10FFFF || (C >= 0xD800 && C <= 0xDFFF))
            throw new URIException("Undefined UTF-32 code point");

        // Transform C into its UTF-8 octets
        char[4] Octet;
        size_t L;

        if (C <= 0x7F)
        {
            Octet[0] = cast(char) C;
            L = 1;
        }
        else if (C <= 0x7FF)
        {
            Octet[0] = cast(char)(0xC0 | (C >> 6));
            Octet[1] = cast(char)(0x80 | (C & 0x3F));
            L = 2;
        }
        else if (C <= 0xFFFF)
        {
            Octet[0] = cast(char)(0xE0 | (C >> 12));
            Octet[1] = cast(char)(0x80 | ((C >> 6) & 0x3F));
            Octet[2] = cast(char)(0x80 | (C & 0x3F));
            L = 3;
        }
        else
        {
            Octet[0] = cast(char)(0xF0 | (C >> 18));
            Octet[1] = cast(char)(0x80 | ((C >> 12) & 0x3F));
            Octet[2] = cast(char)(0x80 | ((C >> 6) & 0x3F));
            Octet[3] = cast(char)(0x80 | (C & 0x3F));
            L = 4;
        }

        // Each octet becomes three output characters: '%' and two hex digits
        if (Rlen + L * 3 > R.length)
        {
            auto R2 = new char[2 * (Rlen + L * 3)];
            R2[0 .. Rlen] = R[0 .. Rlen];
            R = R2;
        }

        foreach (octet; Octet[0 .. L])
        {
            R[Rlen] = '%';
            R[Rlen + 1] = hex2ascii[octet >> 4];
            R[Rlen + 2] = hex2ascii[octet & 15];
            Rlen += 3;
        }
    }

    return R[0 .. Rlen].idup;
}

@safe pure unittest
{
    import std.exception : assertThrown;

    assert(URI_Encode("", 0) == "");
    assert(URI_Encode(URI_Decode("%F0%BF%BF%BF", 0), 0) == "%F0%BF%BF%BF");

    // values that are not Unicode scalar values are rejected
    foreach (uint bad; [0xD800, 0xDFFF, 0x110000, 0x1FFFFF, 0xFFFFFFFF])
    {
        dstring a;
        a ~= cast(dchar) bad;
        assertThrown!URIException(URI_Encode(a, 0));
    }

    // 60 escaped characters exceed the 50-char stack buffer
    dstring sixty;
    foreach (_; 0 .. 60)
        sixty ~= 'a';
    assert(URI_Encode(sixty, 0).length == 3 * 60);
}
186 
// Numeric value of the hexadecimal digit `c` (either letter case).
// No validation is done here; URI_Decode checks with isHexDigit before calling.
private uint ascii2hex(dchar c) @nogc @safe pure nothrow
{
    if (c <= '9')
        return c - '0';
    if (c <= 'F')
        return c - 'A' + 10;
    return c - 'a' + 10;
}
193 
/*
 * Decodes the `%XX` escape sequences in `uri` and returns the result as
 * UTF-32.
 *
 * Params:
 *      uri         = URI text; code units outside of escape sequences are
 *                    decoded as UTF-8/UTF-16/UTF-32 according to `Char`
 *      reservedSet = bitwise OR of URI_* flags; an escape sequence that
 *                    resolves to an ASCII character whose `uri_flags` entry
 *                    intersects this mask is copied through undecoded
 *
 * Throws:
 *      URIException if an escape sequence is truncated, is not made of two
 *      hexadecimal digits, or does not form a well-formed UTF-8 sequence
 *      (bad lead or continuation byte, overlong form, surrogate, value above
 *      U+10FFFF), or if unescaped `char`/`wchar` input is not valid UTF.
 */
private dstring URI_Decode(Char)(scope const(Char)[] uri, uint reservedSet)
if (isSomeChar!Char)
{
    import std.ascii : isHexDigit;

    immutable len = uri.length;

    // Every output code point consumes at least one input code unit, and an
    // escape that is kept is copied unit for unit, so `len` slots suffice.
    auto R = new dchar[len];
    size_t Rlen = 0;

    for (size_t k = 0; k != len; k++)
    {
        dchar C = uri[k];
        if (C != '%')
        {
            static if (Char.sizeof < dchar.sizeof)
            {
                // A non-ASCII code unit is part of a multi-unit sequence;
                // widening it on its own would yield mojibake.
                if (C >= 0x80)
                {
                    import std.utf : UTFException, utfDecode = decode;

                    size_t next = k;
                    try
                    {
                        C = utfDecode(uri, next);
                    }
                    catch (UTFException e)
                    {
                        throw new URIException("Invalid UTF sequence in URI", e);
                    }
                    k = next - 1;   // the loop increment moves on to `next`
                }
            }
            R[Rlen++] = C;
            continue;
        }

        size_t start = k;
        if (k + 2 >= len)
            throw new URIException("Unexpected end of URI");
        if (!isHexDigit(uri[k + 1]) || !isHexDigit(uri[k + 2]))
            throw new URIException("Expected two hexadecimal digits after '%'");
        char B = cast(char)((ascii2hex(uri[k + 1]) << 4) + ascii2hex(uri[k + 2]));
        k += 2;

        if ((B & 0x80) == 0)
        {
            C = B;
        }
        else
        {
            // n = number of leading 1 bits of the lead byte = sequence length
            uint n;
            for (n = 1; ; n++)
            {
                if (n > 4)
                    throw new URIException("UTF-32 code point size too large");
                if (((B << n) & 0x80) == 0)
                {
                    if (n == 1)
                        throw new URIException("UTF-32 code point size too small");
                    break;
                }
            }

            // Pick off (7 - n) significant bits of B from first byte of octet
            uint V = B & ((1 << (7 - n)) - 1);

            // The remaining n - 1 octets need three characters each
            if (k + (3 * (n - 1)) >= len)
                throw new URIException("UTF-32 unaligned String");
            foreach (_; 1 .. n)
            {
                k++;
                if (uri[k] != '%')
                    throw new URIException("Expected: '%'");
                if (!isHexDigit(uri[k + 1]) || !isHexDigit(uri[k + 2]))
                    throw new URIException("Expected two hexadecimal digits after '%'");
                B = cast(char)((ascii2hex(uri[k + 1]) << 4) + ascii2hex(uri[k + 2]));
                if ((B & 0xC0) != 0x80)
                    throw new URIException("Incorrect UTF-32 multi-byte sequence");
                k += 2;
                V = (V << 6) | (B & 0x3F);
            }
            if (V > 0x10FFFF)
                throw new URIException("Unknown UTF-32 code point");

            // Reject overlong forms: otherwise e.g. "%C0%AF" would decode to '/'
            immutable uint smallest = (n == 2) ? 0x80 : (n == 3) ? 0x800 : 0x1_0000;
            if (V < smallest)
                throw new URIException("Overlong UTF-8 sequence");
            if (V >= 0xD800 && V <= 0xDFFF)
                throw new URIException("Surrogate code point in URI");

            C = cast(dchar) V;
        }

        if (C < uri_flags.length && (uri_flags[C] & reservedSet) != 0)
        {
            // R ~= uri[start .. k + 1];  -- keep the escape sequence as written
            foreach (size_t i; start .. k + 1)
                R[Rlen++] = uri[i];
        }
        else
        {
            R[Rlen++] = C;
        }
    }
    assert(Rlen <= len);  // enforce our preallocation size guarantee

    return R[0 .. Rlen].idup;
}

@safe pure unittest
{
    import std.exception : assertThrown;

    assert(URI_Decode("", 0) == "");
    assertThrown!URIException(URI_Decode("%", 0));
    assertThrown!URIException(URI_Decode("%xx", 0));
    assertThrown!URIException(URI_Decode("%FF", 0));
    assertThrown!URIException(URI_Decode("%C0", 0));
    assertThrown!URIException(URI_Decode("%C0000000", 0));
    assertThrown!URIException(URI_Decode("%C0%xx0000", 0));
    assertThrown!URIException(URI_Decode("%C0%C00000", 0));
    assertThrown!URIException(URI_Decode("%F7%BF%BF%BF", 0));
    assert(URI_Decode("%23", URI_Hash) == "%23");

    // overlong forms and surrogates are rejected
    assertThrown!URIException(URI_Decode("%C0%80", 0));
    assertThrown!URIException(URI_Decode("%C0%AF", 0));
    assertThrown!URIException(URI_Decode("%E0%80%AF", 0));
    assertThrown!URIException(URI_Decode("%F0%80%80%AF", 0));
    assertThrown!URIException(URI_Decode("%ED%A0%80", 0));

    // smallest/neighbouring well-formed values are still accepted
    assert(URI_Decode("%C2%80", 0) == "\u0080"d);
    assert(URI_Decode("%ED%9F%BF", 0) == "\uD7FF"d);
    assert(URI_Decode("%EE%80%80", 0) == "\uE000"d);

    // unescaped non-ASCII input is decoded, not widened unit by unit
    assert(URI_Decode("dl\u00E4ng", 0) == "dl\u00E4ng"d);
    assert(URI_Decode("\u00E4"w, 0) == "\u00E4"d);
    assert(URI_Decode("\U0001F600"w, 0) == "\U0001F600"d);
    assert(URI_Decode("\U0001F600"d, 0) == "\U0001F600"d);

    // a lone UTF-8 lead byte is reported as URIException
    const char[1] truncated = [cast(char) 0xE4];
    assertThrown!URIException(URI_Decode(truncated[], 0));
}
308 
/*************************************
 * Decodes the URI string encodedURI into a UTF-8 string and returns it.
 *
 * Escape sequences that resolve to reserved URI characters or to the
 * '#' character are left in place; all other escape sequences are replaced
 * by the character they denote.
 */
string decode(Char)(scope const(Char)[] encodedURI)
if (isSomeChar!Char)
{
    import std.utf : encode;

    char[] utf8;
    // URI_Decode yields UTF-32; append each code point as UTF-8
    foreach (dchar point; URI_Decode(encodedURI, URI_Reserved | URI_Hash))
        encode(utf8, point);
    return utf8;
}

///
@safe unittest
{
    assert("foo%20bar".decode == "foo bar");
    assert("%3C%3E.@.%E2%84%A2".decode == "<>.@.™");
    assert("foo&/".decode == "foo&/");
    assert("!@#$&*(".decode == "!@#$&*(");
}
333 
/*******************************
 * Decodes the URI component string encodedURIComponent into a UTF-8 string
 * and returns it. Every escape sequence is decoded, including those that
 * resolve to reserved characters.
 */
string decodeComponent(Char)(scope const(Char)[] encodedURIComponent)
if (isSomeChar!Char)
{
    import std.utf : encode;

    char[] utf8;
    // An empty reserved set means no escape sequence is kept
    foreach (dchar point; URI_Decode(encodedURIComponent, 0))
        encode(utf8, point);
    return utf8;
}

///
@safe unittest
{
    assert("foo%2F%26".decodeComponent == "foo/&");
    assert("dl%C3%A4ng%20r%C3%B6cks".decodeComponent == "dläng röcks");
    assert("!%40%23%24%25%5E%26*(".decodeComponent == "!@#$%^&*(");
}
356 
/*****************************
 * Encodes the string uri into a URI and returns that URI.
 *
 * Letters, digits, the reserved characters ;/?:@&=+$, the marks -_.!~*'()
 * and the '#' character are kept; every other character is escaped.
 */
string encode(Char)(scope const(Char)[] uri)
if (isSomeChar!Char)
{
    import std.utf : toUTF32;

    // Character classes that pass through without escaping
    enum uint keep = URI_Reserved | URI_Hash | URI_Alpha | URI_Digit | URI_Mark;
    return URI_Encode(toUTF32(uri), keep);
}

///
@safe unittest
{
    assert("foo bar".encode == "foo%20bar");
    assert("<>.@.™".encode == "%3C%3E.@.%E2%84%A2");
    assert("foo/#?a=1&b=2".encode == "foo/#?a=1&b=2");
    assert("dlang+rocks!".encode == "dlang+rocks!");
    assert("!@#$%^&*(".encode == "!@#$%25%5E&*(");
}
378 
/********************************
 * Encodes the string uriComponent into a URI component and returns it.
 * Any character that is not a letter, a digit, or one of -_.!~*'() is
 * escaped; this includes the reserved characters and '#'.
 */
string encodeComponent(Char)(scope const(Char)[] uriComponent)
if (isSomeChar!Char)
{
    import std.utf : toUTF32;

    // Character classes that pass through without escaping
    enum uint keep = URI_Alpha | URI_Digit | URI_Mark;
    return URI_Encode(toUTF32(uriComponent), keep);
}

///
@safe unittest
{
    assert("!@#$%^&*(".encodeComponent == "!%40%23%24%25%5E%26*(");
    assert("<>.@.™".encodeComponent == "%3C%3E.%40.%E2%84%A2");
    assert("foo/&".encodeComponent == "foo%2F%26");
    assert("dläng röcks".encodeComponent == "dl%C3%A4ng%20r%C3%B6cks");
    assert("dlang+rocks!".encodeComponent == "dlang%2Brocks!");
}
400 
/* Encode associative array using www-form-urlencoding
 *
 * Keys and values are escaped with encodeComponent and joined as
 * `key=value` pairs separated by '&'. The pairs appear in the iteration
 * order of the associative array.
 *
 * Params:
 *      values = an associative array containing the values to be encoded.
 *
 * Returns:
 *      A string encoded using www-form-urlencoding; the empty string if
 *      `values` is empty.
 */
package string urlEncode(scope string[string] values) @safe pure
{
    import std.array : Appender;

    if (values.length == 0)
        return "";

    Appender!string enc;
    enc.reserve(values.length * 128);

    bool needSeparator = false;
    foreach (key, value; values)
    {
        if (needSeparator)
            enc.put('&');
        enc.put(encodeComponent(key));
        enc.put('=');
        enc.put(encodeComponent(value));
        needSeparator = true;
    }
    return enc.data;
}

@safe pure unittest
{
    // urlEncode -> encodeComponent -> URI_Encode must all be callable
    // from @safe pure code
    string[string] a;
    assert(urlEncode(a) == "");
    assert(urlEncode(["name1" : "value1"]) == "name1=value1");
    // the order of the pairs follows the associative array
    auto enc = urlEncode(["name1" : "value1", "name2" : "value2"]);
    assert(enc == "name1=value1&name2=value2" || enc == "name2=value2&name1=value1");
}
441 
/***************************
 * Does string s[] start with a URL?
 *
 * A URL is recognised when s[] starts with `http://` or `https://`
 * (compared case-insensitively) and the text that follows contains at
 * least one '.'.
 *
 * Returns:
 *  -1   it does not
 *  len  it does, and s[0 .. len] is the slice of s[] that is that URL
 */
ptrdiff_t uriLength(Char)(scope const(Char)[] s)
if (isSomeChar!Char)
{
    import std.ascii : isAlphaNum;
    import std.uni : icmp;

    /* Must start with one of:
     *  http://
     *  https://
     * and at least one more character has to follow the scheme.
     */
    ptrdiff_t pos;
    if (s.length > 7 && icmp(s[0 .. 7], "http://") == 0)
        pos = 7;
    else if (s.length > 8 && icmp(s[0 .. 8], "https://") == 0)
        pos = 8;
    else
        return -1;

    // Consume URL characters, remembering where the last '.' was seen
    ptrdiff_t lastDot = 0;
    scan: for (; pos < s.length; ++pos)
    {
        switch (s[pos])
        {
            case '.':
                lastDot = pos;
                break;

            case '-', '_', '?', '=', '%', '&', '/', '+', '#', '~', '$':
                break;

            default:
                if (!isAlphaNum(s[pos]))
                    break scan;     // first character that cannot be part of the URL
                break;
        }
    }

    // lastDot can only be 0 if no '.' was found, since scanning starts past the scheme
    return lastDot ? pos : -1;
}

///
@safe pure unittest
{
    string s1 = "http://www.digitalmars.com/~fred/fredsRX.html#foo end!";
    assert(uriLength(s1) == 49);
    string s2 = "no uri here";
    assert(uriLength(s2) == -1);
    assert(uriLength("issue 14924") < 0);
}

@safe pure nothrow @nogc unittest
{
    assert(uriLength("") == -1);
    assert(uriLength("https://www") == -1);
}
516 
/***************************
 * Does string s[] start with an email address?
 *
 * The address must start with a letter, and the text after the last '.'
 * of the domain must be two or three characters long.
 *
 * Returns:
 *  -1    it does not
 *  len   it does, and s[0 .. len] is the slice of s[] that is that email address
 * References:
 *  RFC2822
 */
ptrdiff_t emailLength(Char)(scope const(Char)[] s)
if (isSomeChar!Char)
{
    import std.ascii : isAlpha, isAlphaNum;

    if (s.length == 0 || !isAlpha(s[0]))
        return -1;

    /* Local part: letters, digits, '-', '_' and '.', terminated by '@'
     */
    ptrdiff_t i = 1;
    for (;; i++)
    {
        if (i == s.length)
            return -1;          // ran out of input before finding '@'

        immutable c = s[i];
        if (c == '@')
        {
            i++;                // step past the '@'
            break;
        }
        if (!isAlphaNum(c) && c != '-' && c != '_' && c != '.')
            return -1;
    }

    /* Now do the part past the '@'
     */
    ptrdiff_t lastdot;
    for (; i < s.length; i++)
    {
        immutable c = s[i];
        if (c == '.')
            lastdot = i;
        else if (!isAlphaNum(c) && c != '-' && c != '_')
            break;
    }

    // Number of characters from the last '.' (inclusive) to the end of the match
    immutable tail = i - lastdot;
    if (!lastdot || (tail != 3 && tail != 4))
        return -1;

    return i;
}

///
@safe pure unittest
{
    string s1 = "my.e-mail@www.example-domain.com with garbage added";
    assert(emailLength(s1) == 32);
    string s2 = "no email address here";
    assert(emailLength(s2) == -1);
    assert(emailLength("issue 14924") < 0);
}
585 
@safe pure unittest
{
    // Round-trip checks for encode/decode/encodeComponent, run from
    // @safe pure code.
    debug(uri) writeln("uri.encodeURI.unittest");

    string source = "http://www.digitalmars.com/~fred/fred's RX.html#foo";
    string target = "http://www.digitalmars.com/~fred/fred's%20RX.html#foo";

    auto result = source.encode;
    debug(uri) writefln("result = '%s'", result);
    assert(result == target);

    result = target.decode;
    debug(uri) writefln("result = '%s'", result);
    assert(result == source);

    // multi-byte escape sequences survive a decode/encode round trip
    result = "%E3%81%82%E3%81%82".decode.encode;
    assert(result == "%E3%81%82%E3%81%82");

    result = "c++".encodeComponent;
    assert(result == "c%2B%2B");

    // input far larger than the 50-char buffer URI_Encode starts with
    auto big = new char[10_000_000];
    big[] = 'A';
    result = big.encodeComponent;
    foreach (char unit; result)
        assert(unit == 'A');

    result = "%41%42%43".decode;
    debug(uri) writeln(result);

    // every string type is accepted and left unmodified
    import std.meta : AliasSeq;
    static foreach (StringType; AliasSeq!(char[], wchar[], dchar[], string, wstring, dstring))
    {{
        import std.conv : to;

        StringType plain = source.to!StringType;
        string plainEncoded = encode(plain);
        assert(plain == source.to!StringType); // check that `plain` wasn't changed
        assert(plainEncoded == target);
        assert(plain == decode(plainEncoded).to!StringType);

        StringType escaped = target.to!StringType;
        string escapedDecoded = decode(escaped);
        assert(escaped == target.to!StringType); // check that `escaped` wasn't changed
        assert(escapedDecoded == source);
        assert(escaped == encode(escapedDecoded).to!StringType);
    }}
}
633 
@safe pure nothrow @nogc unittest
{
    // inputs that must not be recognised as an email address
    static immutable string[6] rejected = [
        "",
        "@",
        "abcd",
        "blah@blub",
        "blah@blub.",
        "blah@blub.domain",
    ];
    foreach (string candidate; rejected[])
        assert(emailLength(candidate) == -1);
}
641     assert(emailLength("blah@blub.domain") == -1);
642 }