View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.servlet;
19  
20  import org.apache.any23.configuration.DefaultConfiguration;
21  import org.apache.any23.extractor.ExtractionParameters;
22  import org.apache.any23.http.HTTPClient;
23  import org.apache.any23.servlet.conneg.Any23Negotiator;
24  import org.apache.any23.servlet.conneg.MediaRangeSpec;
25  import org.apache.any23.source.ByteArrayDocumentSource;
26  import org.apache.any23.source.DocumentSource;
27  import org.apache.any23.source.HTTPDocumentSource;
28  import org.apache.any23.source.StringDocumentSource;
29  import org.openrdf.rio.RDFFormat;
30  
31  import javax.servlet.ServletException;
32  import javax.servlet.http.HttpServlet;
33  import javax.servlet.http.HttpServletRequest;
34  import javax.servlet.http.HttpServletResponse;
35  import java.io.IOException;
36  import java.net.URI;
37  import java.net.URISyntaxException;
38  import java.util.regex.Pattern;
39  
40  import static org.apache.any23.extractor.ExtractionParameters.ValidationMode;
41  
42  /**
43   * A <i>Servlet</i> that fetches a client-specified <i>URI</i>,
44   * RDFizes the content, and returns it in a format chosen by the client.
45   *
46   * @author Gabriele Renzi
47   * @author Richard Cyganiak (richard@cyganiak.de)
48   */
49  public class Servlet extends HttpServlet {
50  
51      public static final String DEFAULT_BASE_URI = "http://any23.org/tmp/";
52  
53      private static final long serialVersionUID = 8207685628715421336L;
54  
55      // RFC 3986: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
56      private final static Pattern schemeRegex =
57              Pattern.compile("^[a-zA-Z][a-zA-Z0-9.+-]*:");
58  
59      @Override
60      protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws IOException, ServletException {
61          final WebResponder responder = new WebResponder(this, resp);
62          final String format = getFormatFromRequestOrNegotiation(req);
63          final boolean report = isReport(req);
64          final boolean annotate = isAnnotated(req);
65          if (format == null) {
66              responder.sendError(406, "Client accept header does not include a supported output format", report);
67              return;
68          }
69          final String uri = getInputURIFromRequest(req);
70          if (uri == null) {
71              responder.sendError(404, "Missing URI in GET request. Try /format/http://example.com/myfile", report);
72              return;
73          }
74          final ExtractionParameters eps = getExtractionParameters(req);
75          responder.runExtraction(createHTTPDocumentSource(responder, uri, report), eps, format, report, annotate);
76      }
77  
78      @Override
79      protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws IOException {
80          final WebResponder responder = new WebResponder(this, resp);
81          final boolean report = isReport(req);
82          final boolean annotate = isAnnotated(req);
83          if (req.getContentType() == null) {
84              responder.sendError(400, "Invalid POST request, no Content-Type for the message body specified", report);
85              return;
86          }
87          final String uri = getInputURIFromRequest(req);
88          final String format = getFormatFromRequestOrNegotiation(req);
89          if (format == null) {
90              responder.sendError(406, "Client accept header does not include a supported output format", report);
91              return;
92          }
93          final ExtractionParameters eps = getExtractionParameters(req);
94          if ("application/x-www-form-urlencoded".equals(getContentTypeHeader(req))) {
95              if (uri != null) {
96                  log("Attempting conversion to '" + format + "' from URI <" + uri + ">");
97                  responder.runExtraction(createHTTPDocumentSource(responder, uri, report), eps, format, report, annotate);
98                  return;
99              }
100             if (req.getParameter("body") == null) {
101                 responder.sendError(400, "Invalid POST request, parameter 'uri' or 'body' required", report);
102                 return;
103             }
104             String type = null;
105             if (req.getParameter("type") != null && !"".equals(req.getParameter("type"))) {
106                 type = req.getParameter("type");
107             }
108             log("Attempting conversion to '" + format + "' from body parameter");
109             responder.runExtraction(
110                     new StringDocumentSource(req.getParameter("body"), Servlet.DEFAULT_BASE_URI, type),
111                     eps,
112                     format,
113                     report, annotate
114             );
115             return;
116         }
117         log("Attempting conversion to '" + format + "' from POST body");
118         responder.runExtraction(
119                 new ByteArrayDocumentSource(
120                         req.getInputStream(),
121                         Servlet.DEFAULT_BASE_URI,
122                         getContentTypeHeader(req)
123                 ),
124                 eps,
125                 format,
126                 report, annotate
127         );
128     }
129 
130     private String getFormatFromRequestOrNegotiation(HttpServletRequest request) {
131         String fromRequest = getFormatFromRequest(request);
132         if (fromRequest != null && !"".equals(fromRequest) && !"best".equals(fromRequest)) {
133             return fromRequest;
134         }
135         MediaRangeSpec result = Any23Negotiator.getNegotiator().getBestMatch(request.getHeader("Accept"));
136         if (result == null) {
137             return null;
138         }
139         else if (RDFFormat.TURTLE.hasMIMEType(result.getMediaType())) {
140             return "turtle";
141         }
142         else if (RDFFormat.N3.hasMIMEType(result.getMediaType())) {
143             return "n3";
144         }
145         else if (RDFFormat.NQUADS.hasMIMEType(result.getMediaType())) {
146             return "nq";
147         }
148         else if (RDFFormat.RDFXML.hasMIMEType(result.getMediaType())) {
149             return "rdf";
150         }
151         else if (RDFFormat.NTRIPLES.hasMIMEType(result.getMediaType())) {
152             return "nt";
153         }
154         else {
155             return "turtle";    // shouldn't happen
156         }
157     }
158 
159     private String getFormatFromRequest(HttpServletRequest request) {
160         if (request.getPathInfo() == null) return "best";
161         String[] args = request.getPathInfo().split("/", 3);
162         if (args.length < 2 || "".equals(args[1])) {
163             if (request.getParameter("format") == null) {
164                 return "best";
165             } else {
166                 return request.getParameter("format");
167             }
168         }
169         return args[1];
170     }
171 
172     private String getInputURIFromRequest(HttpServletRequest request) {
173         if (request.getPathInfo() == null) return null;
174         String[] args = request.getPathInfo().split("/", 3);
175         if (args.length < 3) {
176             if (request.getParameter("uri") != null) {
177                 return request.getParameter("uri").trim();
178             }
179             if (request.getParameter("url") != null) {
180                 return request.getParameter("url").trim();
181             }
182             return null;
183         }
184         String uri = args[2];
185         if (request.getQueryString() != null) {
186             uri = uri + "?" + request.getQueryString();
187         }
188         if (!hasScheme(uri)) {
189             uri = "http://" + uri;
190         } else if (hasOnlySingleSlashAfterScheme(uri)) {
191             // This is to work around an issue where Tomcat 6.0.18 is
192             // too smart for us. Tomcat normalizes double-slashes in
193             // the path, and thus turns "http://" into "http:/" if it
194             // occurs in the path. So we restore the double slash.
195             uri = uri.replaceFirst(":/", "://");
196         }
197         return uri.trim();
198     }
199 
200 
201     private boolean hasScheme(String uri) {
202         return schemeRegex.matcher(uri).find();
203     }
204 
205     private final static Pattern schemeAndSingleSlashRegex =
206             Pattern.compile("^[a-zA-Z][a-zA-Z0-9.+-]*:/[^/]");
207 
208     private boolean hasOnlySingleSlashAfterScheme(String uri) {
209         return schemeAndSingleSlashRegex.matcher(uri).find();
210     }
211 
212     private String getContentTypeHeader(HttpServletRequest req) {
213         if (req.getHeader("Content-Type") == null) return null;
214         if ("".equals(req.getHeader("Content-Type"))) return null;
215         String contentType = req.getHeader("Content-Type");
216         // strip off parameters such as ";charset=UTF-8"
217         int index = contentType.indexOf(";");
218         if (index == -1) return contentType;
219         return contentType.substring(0, index);
220     }
221 
222     private DocumentSource createHTTPDocumentSource(WebResponder responder, String uri, boolean report)
223     throws IOException {
224         try {
225             if (!isValidURI(uri)) {
226                 throw new URISyntaxException(uri, "@@@");
227             }
228             return createHTTPDocumentSource(responder.getRunner().getHTTPClient(), uri);
229         } catch (URISyntaxException ex) {
230             responder.sendError(400, "Invalid input URI " + uri, report);
231             return null;
232         }
233     }
234 
235     protected DocumentSource createHTTPDocumentSource(HTTPClient httpClient, String uri)
236             throws IOException, URISyntaxException {
237         return new HTTPDocumentSource(httpClient, uri);
238     }
239 
240     private boolean isValidURI(String s) {
241         try {
242             URI uri = new URI(s);
243             if (!"http".equals(uri.getScheme()) && !"https".equals(uri.getScheme())) {
244                 return false;
245             }
246         } catch (URISyntaxException e) {
247             return false;
248         }
249         return true;
250     }
251 
252     private ValidationMode getValidationMode(HttpServletRequest request) {
253         final String PARAMETER = "validation-mode";
254         final String validationMode = request.getParameter(PARAMETER);
255         if(validationMode == null) return ValidationMode.None;
256         if("none".equalsIgnoreCase(validationMode)) return ValidationMode.None;
257         if("validate".equalsIgnoreCase(validationMode)) return ValidationMode.Validate;
258         if("validate-fix".equalsIgnoreCase(validationMode)) return ValidationMode.ValidateAndFix;
259         throw new IllegalArgumentException(
260                 String.format("Invalid value '%s' for '%s' parameter.", validationMode, PARAMETER)
261         );
262     }
263     
264     private ExtractionParameters getExtractionParameters(HttpServletRequest request) {
265         final ValidationMode mode = getValidationMode(request);
266         return new ExtractionParameters(DefaultConfiguration.singleton(), mode);
267     }
268 
269     private boolean isReport(HttpServletRequest request) {
270         return request.getParameter("report") != null;
271     }
272 
273     private boolean isAnnotated(HttpServletRequest request) {
274         return request.getParameter("annotate") != null;
275     }
276 
277 }