View Javadoc
1   /* 
2    * Licensed under the Apache License, Version 2.0 (the "License");
3    * you may not use this file except in compliance with the License.
4    * You may obtain a copy of the License at
5    *
6    * http://www.apache.org/licenses/LICENSE-2.0
7    *
8    * Unless required by applicable law or agreed to in writing, software
9    * distributed under the License is distributed on an "AS IS" BASIS,
10   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11   * See the License for the specific language governing permissions and
12   * limitations under the License.
13   *
14   */
15  
16  package org.esigate.impl;
17  
18  import java.net.URI;
19  import java.util.regex.Matcher;
20  import java.util.regex.Pattern;
21  
22  import org.apache.commons.lang3.StringEscapeUtils;
23  import org.apache.commons.lang3.StringUtils;
24  import org.esigate.util.UriUtils;
25  import org.slf4j.Logger;
26  import org.slf4j.LoggerFactory;
27  
28  /**
29   * "fixes" links to resources, images and pages in pages retrieved by esigate :
30   * <ul>
31   * <li>Current-path-relative urls are converted to full path relative urls ( img/test.img -&gt;
32   * /myapp/curentpath/img/test.img)</li>
33   * <li>All relative urls can be converted to absolute urls (including server name)</li>
34   * </ul>
35   * 
36   * This enables use of esigate without any special modifications of the generated urls on the provider side.
37   * 
38   * All href and src attributes are processed, except javascript links.
39   * 
40   * @author Nicolas Richeton
41   * 
42   */
43  public class UrlRewriter {
44      private static final Logger LOG = LoggerFactory.getLogger(UrlRewriter.class);
45  
46      private static final Pattern URL_PATTERN = Pattern.compile(
47              "<([^\\!:>]+)(src|href|action|background|content)\\s*=\\s*('[^<']*'|\"[^<\"]*\")([^>]*)>",
48              Pattern.CASE_INSENSITIVE);
49  
50      private static final Pattern JAVASCRIPT_CONCATENATION_PATTERN = Pattern.compile(
51              "\\+\\s*'|\\+\\s*\"|'\\s*\\+|\"\\s*\\+", Pattern.CASE_INSENSITIVE);
52  
53      private static final Pattern META_REFRESH_PATTERN = Pattern.compile(
54              "<\\s*meta([^>]+)http-equiv\\s*=\\s*(\"|')refresh(\"|')", Pattern.CASE_INSENSITIVE);
55  
56      /**
57       * Rewrites urls from the response for the client or from the request to the target server.
58       * 
59       * If mode is ABSOLUTE, all relative urls will be replaced by the full urls :
60       * <ul>
61       * <li>images/image.png is replaced by http://server/context/images/image.png</li>;
62       * <li>/context/images/image.png is replaced by http://server/context/images/image.png</li>;
63       * </ul>
64       * 
65       * If mode is RELATIVE, context will be added to relative urls :
66       * <ul>
67       * <li>images/image.png is replaced by /context/images/image.png</li>
68       * </ul>
69       * 
70       * 
71       */
72      public UrlRewriter() {
73      }
74  
75      /**
76       * Fixes a referer url in a request.
77       * 
78       * @param referer
79       *            the url to fix (can be anything found in an html page, relative, absolute, empty...)
80       * @param baseUrl
81       *            The base URL selected for this request.
82       * @param visibleBaseUrl
83       *            The base URL viewed by the browser.
84       * 
85       * @return the fixed url.
86       */
87      public String rewriteReferer(String referer, String baseUrl, String visibleBaseUrl) {
88          URI uri = UriUtils.createURI(referer);
89  
90          // Base url should end with /
91          if (!baseUrl.endsWith("/")) {
92              baseUrl = baseUrl + "/";
93          }
94          URI baseUri = UriUtils.createURI(baseUrl);
95  
96          // If no visible url base is defined, use base url as visible base url
97          if (!visibleBaseUrl.endsWith("/")) {
98              visibleBaseUrl = visibleBaseUrl + "/";
99          }
100         URI visibleBaseUri = UriUtils.createURI(visibleBaseUrl);
101 
102         // Relativize url to visible base url
103         URI relativeUri = visibleBaseUri.relativize(uri);
104         // If the url is unchanged do nothing
105         if (relativeUri.equals(uri)) {
106             LOG.debug("url kept unchanged: [{}]", referer);
107             return referer;
108         }
109         // Else rewrite replacing baseUrl by visibleBaseUrl
110         URI result = baseUri.resolve(relativeUri);
111         LOG.debug("referer fixed: [{}] -> [{}]", referer, result);
112         return result.toString();
113     }
114 
115     /**
116      * Fixes an url according to the chosen mode.
117      * <p>
118      * Note: urls starting with an ESI variable are not rewriten.
119      * 
120      * @param url
121      *            the url to fix (can be anything found in an html page, relative, absolute, empty...)
122      * @param requestUrl
123      *            The incoming request URL (could be absolute or relative to visible base url).
124      * @param baseUrl
125      *            The base URL selected for this request.
126      * @param visibleBaseUrl
127      *            The base URL viewed by the browser.
128      * @param absolute
129      *            Should the rewritten urls contain the scheme host and port
130      * 
131      * @return the fixed url.
132      */
133     public String rewriteUrl(String url, String requestUrl, String baseUrl, String visibleBaseUrl, boolean absolute) {
134 
135         // Do not rewrite Urls starting with ESI variables
136         // This could be improved by detecting we are in an 'esi:vars' block,
137         // but this would link the rewriter with ESI parsing.
138         if (url.startsWith("$(")) {
139             return url;
140         }
141 
142         // Base url should end with /
143         if (!baseUrl.endsWith("/")) {
144             baseUrl = baseUrl + "/";
145         }
146         URI baseUri = UriUtils.createURI(baseUrl);
147 
148         // If no visible url base is defined, use base url as visible base url
149         if (!visibleBaseUrl.endsWith("/")) {
150             visibleBaseUrl = visibleBaseUrl + "/";
151         }
152         URI visibleBaseUri = UriUtils.createURI(visibleBaseUrl);
153 
154         // Build the absolute Uri of the request sent to the backend
155         URI requestUri;
156         if (requestUrl.startsWith(visibleBaseUrl)) {
157             requestUri = UriUtils.createURI(requestUrl);
158         } else {
159             requestUri = UriUtils.concatPath(baseUri, requestUrl);
160         }
161 
162         // Interpret the url relatively to the request url (may be relative)
163         URI uri = UriUtils.resolve(url, requestUri);
164         // Normalize the path (remove . or .. if possible)
165         uri = uri.normalize();
166 
167         // Try to relativize url to base url
168         URI relativeUri = baseUri.relativize(uri);
169         // If the url is unchanged do nothing
170         if (relativeUri.equals(uri)) {
171             LOG.debug("url kept unchanged: [{}]", url);
172             return url;
173         }
174         // Else rewrite replacing baseUrl by visibleBaseUrl
175         URI result = visibleBaseUri.resolve(relativeUri);
176         // If mode relative, remove all the scheme://host:port to keep only a url relative to server root (starts with
177         // "/")
178         if (!absolute) {
179             result = UriUtils.removeServer(result);
180         }
181         LOG.debug("url fixed: [{}] -> [{}]", url, result);
182         return result.toString();
183     }
184 
185     /**
186      * Fixes all resources urls and returns the result.
187      * 
188      * @param input
189      *            The html to be processed.
190      * 
191      * @param requestUrl
192      *            The request URL.
193      * @param baseUrlParam
194      *            The base URL selected for this request.
195      * @param visibleBaseUrl
196      *            The base URL viewed by the browser.
197      * @param absolute
198      *            Should the rewritten urls contain the scheme host and port
199      * 
200      * @return the result of this renderer.
201      */
202     public CharSequence rewriteHtml(CharSequence input, String requestUrl, String baseUrlParam, String visibleBaseUrl,
203             boolean absolute) {
204         StringBuffer result = new StringBuffer(input.length());
205         Matcher m = URL_PATTERN.matcher(input);
206         while (m.find()) {
207             String url = input.subSequence(m.start(3) + 1, m.end(3) - 1).toString();
208             String tag = m.group(0);
209             String quote = input.subSequence(m.end(3) - 1, m.end(3)).toString();
210 
211             // Browsers tolerate urls with white spaces before or after
212             String trimmedUrl = StringUtils.trim(url);
213 
214             String rewrittenUrl = url;
215 
216             trimmedUrl = unescapeHtml(trimmedUrl);
217 
218             if (trimmedUrl.isEmpty()) {
219                 LOG.debug("empty url kept unchanged");
220             } else if (trimmedUrl.startsWith("#")) {
221                 LOG.debug("anchor url kept unchanged: [{}]", url);
222             } else if (JAVASCRIPT_CONCATENATION_PATTERN.matcher(trimmedUrl).find()) {
223                 LOG.debug("url in javascript kept unchanged: [{}]", url);
224             } else if (m.group(2).equalsIgnoreCase("content")) {
225                 if (META_REFRESH_PATTERN.matcher(tag).find()) {
226                     rewrittenUrl = rewriteRefresh(trimmedUrl, requestUrl, baseUrlParam, visibleBaseUrl);
227                     rewrittenUrl = escapeHtml(rewrittenUrl);
228                     LOG.debug("refresh url [{}] rewritten [{}]", url, rewrittenUrl);
229                 } else {
230                     LOG.debug("content attribute kept unchanged: [{}]", url);
231                 }
232             } else {
233                 rewrittenUrl = rewriteUrl(trimmedUrl, requestUrl, baseUrlParam, visibleBaseUrl, absolute);
234                 rewrittenUrl = escapeHtml(rewrittenUrl);
235                 LOG.debug("url [{}] rewritten [{}]", url, rewrittenUrl);
236             }
237 
238             m.appendReplacement(result, ""); // Copy what is between the previous match and the current match
239             result.append("<");
240             result.append(m.group(1));
241             result.append(m.group(2));
242             result.append("=");
243             result.append(quote);
244             result.append(rewrittenUrl);
245             result.append(quote);
246             if (m.groupCount() > 3) {
247                 result.append(m.group(4));
248             }
249             result.append(">");
250         }
251 
252         m.appendTail(result); // Copy the reminder of the input
253 
254         return result;
255     }
256 
257     private String unescapeHtml(String url) {
258         // Unescape entities, ex: &apos; or &#39;
259         url = StringEscapeUtils.unescapeHtml4(url);
260         return url;
261     }
262 
263     private String escapeHtml(String url) {
264         // Escape the previously unescaped characters
265         url = StringEscapeUtils.escapeHtml4(url);
266         // Replace " by &quot; in order not to break the html
267         url = url.replaceAll("'", "&apos;");
268         url = url.replaceAll("\"", "&quot;");
269         return url;
270     }
271 
272     /**
273      * Rewrites a "Refresh" HTTP header or a &lt;meta http-equiv="refresh"... tag. The value should have the following
274      * format:
275      * 
276      * Refresh: 5; url=http://www.example.com
277      * 
278      * @param input
279      *            The refresh value to be rewritten.
280      * @param requestUrl
281      *            The request URL.
282      * @param baseUrl
283      *            The base URL selected for this request.
284      * @param visibleBaseUrl
285      *            The base URL viewed by the browser.
286      * @return the rewritten refresh value
287      */
288     public String rewriteRefresh(String input, String requestUrl, String baseUrl, String visibleBaseUrl) {
289         // Header has the following format
290         // Refresh: 5; url=http://www.w3.org/pub/WWW/People.html
291         int urlPosition = input.indexOf("url=");
292         if (urlPosition >= 0) {
293             String urlValue = input.substring(urlPosition + "url=".length());
294             String targetUrlValue = rewriteUrl(urlValue, requestUrl, baseUrl, visibleBaseUrl, true);
295             return input.substring(0, urlPosition) + "url=" + targetUrlValue;
296         } else {
297             return input;
298         }
299     }
300 }