1: <?php
2:
3: /**
4: * HTML Purifier's internal representation of a URI.
5: * @note
6: * Internal data-structures are completely escaped. If the data needs
7: * to be used in a non-URI context (which is very unlikely), be sure
8: * to decode it first. The URI may not necessarily be well-formed until
9: * validate() is called.
10: */
class HTMLPurifier_URI
{

    /**
     * Individual URI components. A component that is absent from the URI
     * is stored as null; this is distinct from an empty string (see
     * toString() for how a null host and an empty host render differently).
     */
    public $scheme, $userinfo, $host, $port, $path, $query, $fragment;

    /**
     * Builds a URI from its already-split components.
     *
     * @note Automatically normalizes scheme and port: the scheme is
     *       lower-cased and a non-null port is cast to an integer.
     * @param $scheme   Scheme name without the trailing ':', or null
     * @param $userinfo Userinfo part of the authority, or null
     * @param $host     Host part of the authority, or null
     * @param $port     Port (numeric string or integer), or null
     * @param $path     Path component
     * @param $query    Query without the leading '?', or null
     * @param $fragment Fragment without the leading '#', or null
     */
    public function __construct($scheme, $userinfo, $host, $port, $path, $query, $fragment) {
        // ctype_lower() lets an already lower-case scheme skip strtolower()
        $this->scheme = is_null($scheme) || ctype_lower($scheme) ? $scheme : strtolower($scheme);
        $this->userinfo = $userinfo;
        $this->host = $host;
        $this->port = is_null($port) ? $port : (int) $port;
        $this->path = $path;
        $this->query = $query;
        $this->fragment = $fragment;
    }

    /**
     * Retrieves a scheme object corresponding to the URI's scheme, falling
     * back to the configured default scheme when the URI has none.
     *
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return Scheme object appropriate for validating this URI, or false
     *         if the registry returned nothing for the scheme. A missing
     *         default scheme additionally raises an E_USER_WARNING.
     */
    public function getSchemeObj($config, $context) {
        $registry = HTMLPurifier_URISchemeRegistry::instance();
        if ($this->scheme !== null) {
            $scheme_obj = $registry->getScheme($this->scheme, $config, $context);
            if (!$scheme_obj) return false; // invalid scheme, clean it out
        } else {
            // no scheme: retrieve the default one
            $def = $config->getDefinition('URI');
            $scheme_obj = $registry->getScheme($def->defaultScheme, $config, $context);
            if (!$scheme_obj) {
                // something funky happened to the default scheme object
                trigger_error(
                    'Default scheme object "' . $def->defaultScheme . '" was not readable',
                    E_USER_WARNING
                );
                return false;
            }
        }
        return $scheme_obj;
    }

    /**
     * Generic validation method applicable for all schemes. May modify
     * this URI in order to get it into a compliant form: an invalid host
     * or out-of-range port is nulled, a redundant default scheme is
     * dropped, and the remaining components are percent-encoded.
     *
     * @param $config Instance of HTMLPurifier_Config
     * @param $context Instance of HTMLPurifier_Context
     * @return True if validation/filtering succeeds, false if failure
     *         (the generic checks below always end by returning true)
     */
    public function validate($config, $context) {

        // ABNF definitions from RFC 3986
        $chars_sub_delims = '!$&\'()*+,;=';
        $chars_pchar = $chars_sub_delims . ':@';

        // validate host
        if (!is_null($this->host)) {
            $host_def = new HTMLPurifier_AttrDef_URI_Host();
            $this->host = $host_def->validate($this->host, $config, $context);
            if ($this->host === false) $this->host = null;
        }

        // validate scheme
        // NOTE: It's not appropriate to check whether or not this
        // scheme is in our registry, since a URIFilter may convert a
        // URI that we don't allow into one we do.  So instead, we just
        // check if the scheme can be dropped because there is no host
        // and it is our default scheme.
        // The host test is grouped explicitly: without the parentheses
        // && binds tighter than ||, so an empty host would enter this
        // branch even when there is no scheme to drop.
        if (!is_null($this->scheme) && (is_null($this->host) || $this->host === '')) {
            // support for relative paths is pretty abysmal when the
            // scheme is present, so axe it when possible
            $def = $config->getDefinition('URI');
            if ($def->defaultScheme === $this->scheme) {
                $this->scheme = null;
            }
        }

        // validate username
        if (!is_null($this->userinfo)) {
            $encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . ':');
            $this->userinfo = $encoder->encode($this->userinfo);
        }

        // validate port
        if (!is_null($this->port)) {
            if ($this->port < 1 || $this->port > 65535) $this->port = null;
        }

        // validate path
        // A null path renders exactly like an empty one in toString(), so
        // normalize it up front; this keeps the offset access and strpos()
        // below from being handed a non-string.
        if (is_null($this->path)) $this->path = '';
        $segments_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/');
        if (!is_null($this->host)) { // this catches $this->host === ''
            // path-abempty (hier and relative)
            // http://www.example.com/my/path
            // //www.example.com/my/path (looks odd, but works, and
            //                            recognized by most browsers)
            // (this set is valid or invalid on a scheme by scheme
            // basis, so we'll deal with it later)
            // file:///my/path
            // ///my/path
            $this->path = $segments_encoder->encode($this->path);
        } elseif ($this->path !== '') {
            if ($this->path[0] === '/') {
                // path-absolute (hier and relative)
                // http:/my/path
                // /my/path
                if (strlen($this->path) >= 2 && $this->path[1] === '/') {
                    // This could happen if the host gets stripped out,
                    // leaving a path that would be re-read as an authority
                    // http://my/path
                    // //my/path
                    $this->path = '';
                } else {
                    $this->path = $segments_encoder->encode($this->path);
                }
            } elseif (!is_null($this->scheme)) {
                // path-rootless (hier)
                // http:my/path
                // Short circuit evaluation means we don't need to check nz
                $this->path = $segments_encoder->encode($this->path);
            } else {
                // path-noscheme (relative)
                // my/path
                // (once again, not checking nz)
                // The first segment uses an encoder that does not preserve
                // ':', so a colon there cannot be mistaken for a scheme.
                $segment_nc_encoder = new HTMLPurifier_PercentEncoder($chars_sub_delims . '@');
                $c = strpos($this->path, '/');
                if ($c !== false) {
                    $this->path =
                        $segment_nc_encoder->encode(substr($this->path, 0, $c)) .
                        $segments_encoder->encode(substr($this->path, $c));
                } else {
                    $this->path = $segment_nc_encoder->encode($this->path);
                }
            }
        } else {
            // path-empty (hier and relative)
            $this->path = ''; // just to be safe
        }

        // qf = query and fragment
        $qf_encoder = new HTMLPurifier_PercentEncoder($chars_pchar . '/?');

        if (!is_null($this->query)) {
            $this->query = $qf_encoder->encode($this->query);
        }

        if (!is_null($this->fragment)) {
            $this->fragment = $qf_encoder->encode($this->fragment);
        }

        return true;

    }

    /**
     * Convert URI back to string by concatenating the non-null components
     * with their delimiters.
     *
     * @return String URI appropriate for output
     */
    public function toString() {
        // reconstruct authority
        $authority = null;
        // there is a rendering difference between a null authority
        // (http:foo-bar) and an empty string authority
        // (http:///foo-bar).
        if (!is_null($this->host)) {
            $authority = '';
            if (!is_null($this->userinfo)) $authority .= $this->userinfo . '@';
            $authority .= $this->host;
            if (!is_null($this->port)) $authority .= ':' . $this->port;
        }

        // Reconstruct the result
        // One might wonder about parsing quirks from browsers after
        // this reconstruction.  Unfortunately, parsing behavior depends
        // on what *scheme* was employed (file:///foo is handled *very*
        // differently than http:///foo), so unfortunately we have to
        // defer to the schemes to do the right thing.
        $result = '';
        if (!is_null($this->scheme)) $result .= $this->scheme . ':';
        if (!is_null($authority)) $result .= '//' . $authority;
        $result .= $this->path;
        if (!is_null($this->query)) $result .= '?' . $this->query;
        if (!is_null($this->fragment)) $result .= '#' . $this->fragment;

        return $result;
    }

}
203:
204: // vim: et sw=4 sts=4
205: