View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.servlet;
19  
20  import org.apache.any23.configuration.DefaultConfiguration;
21  import org.apache.any23.extractor.ExtractionParameters;
22  import org.apache.any23.extractor.ExtractorRegistry;
23  import org.apache.any23.extractor.ExtractorRegistryImpl;
24  import org.apache.any23.http.HTTPClient;
25  import org.apache.any23.plugin.Any23PluginManager;
26  import org.apache.any23.servlet.conneg.Any23Negotiator;
27  import org.apache.any23.servlet.conneg.MediaRangeSpec;
28  import org.apache.any23.source.ByteArrayDocumentSource;
29  import org.apache.any23.source.DocumentSource;
30  import org.apache.any23.source.HTTPDocumentSource;
31  import org.apache.any23.source.StringDocumentSource;
32  import org.eclipse.rdf4j.rio.RDFFormat;
33  import org.slf4j.Logger;
34  import org.slf4j.LoggerFactory;
35  
36  import javax.servlet.ServletException;
37  import javax.servlet.http.HttpServlet;
38  import javax.servlet.http.HttpServletRequest;
39  import javax.servlet.http.HttpServletResponse;
40  
41  import java.io.File;
42  import java.io.IOException;
43  import java.net.URI;
44  import java.net.URISyntaxException;
45  import java.util.regex.Pattern;
46  
47  import static org.apache.any23.extractor.ExtractionParameters.ValidationMode;
48  
49  /**
50   * A <i>Servlet</i> that fetches a client-specified <i>IRI</i>,
51   * RDFizes the content, and returns it in a format chosen by the client.
52   *
53   * @author Gabriele Renzi
54   * @author Richard Cyganiak (richard@cyganiak.de)
55   */
56  public class Servlet extends HttpServlet {
57  
58      private static final Logger LOG = LoggerFactory.getLogger(Servlet.class);
59  
60      public static final String DEFAULT_BASE_IRI = "http://any23.org/tmp/";
61  
62      private static final long serialVersionUID = 8207685628715421336L;
63  
64      private static final Pattern schemeAndSingleSlashRegex =
65              Pattern.compile("^[a-zA-Z][a-zA-Z0-9.+-]*:/[^/]");
66  
67      // RFC 3986: scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
68      private static final Pattern schemeRegex =
69              Pattern.compile("^[a-zA-Z][a-zA-Z0-9.+-]*:");
70  
71      @Override
72      protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws IOException, ServletException {
73          final WebResponder.html#WebResponder">WebResponder responder = new WebResponder(this, resp);
74          final String format = getFormatFromRequestOrNegotiation(req);
75          final boolean report = isReport(req);
76          final boolean annotate = isAnnotated(req);
77          final boolean openie = isOpenIE(req);
78          if (format == null) {
79              try {
80                  responder.sendError(406, "Client accept header does not include a supported output format", report);
81                  return;
82              } catch (IOException e) {
83                  LOG.error("Unable to send error for null request format.", e);
84              }
85          }
86          final String uri = getInputIRIFromRequest(req);
87          if (uri == null) {
88              try {
89                  responder.sendError(404, "Missing IRI in GET request. Try /format/http://example.com/myfile", report);
90                  return;
91              } catch (Exception e) {
92                  LOG.error("Unable to send error for null request IRI.", e);
93              }
94          }
95          if (openie) {
96              Any23PluginManager pManager = Any23PluginManager.getInstance();
97              //Dynamically adding Jar's to the Classpath via the following logic
98              //is absolutely dependant on the 'apache-any23-openie' directory being
99              //present within the webapp /lib directory. This is specified within 
100             //the maven-dependency-plugin.
101             File webappClasspath = new File(getClass().getClassLoader().getResource("").getPath());
102             File openIEJarPath = new File(webappClasspath.getParentFile().getPath() + "/lib/apache-any23-openie");
103             boolean loadedJars = pManager.loadJARDir(openIEJarPath);
104             if (loadedJars) {
105                 ExtractorRegistry r = ExtractorRegistryImpl.getInstance();
106                 try {
107                     pManager.getExtractors().forEachRemaining(r::register);
108                 } catch (IOException e) {
109                     LOG.error("Error during dynamic classloading of JARs from OpenIE runtime directory {}", openIEJarPath.toString(), e);
110                 }
111                 LOG.info("Successful dynamic classloading of JARs from OpenIE runtime directory {}", openIEJarPath.toString());
112             }
113         }
114         final ExtractionParameters eps = getExtractionParameters(req);
115         try {
116             responder.runExtraction(createHTTPDocumentSource(responder, uri, report), eps, format, report, annotate);
117         } catch (IOException e) {
118             LOG.error("Unable to run extraction on HTTPDocumentSource.", e);
119         }
120     }
121 
122     @Override
123     protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws IOException {
124         final WebResponder.html#WebResponder">WebResponder responder = new WebResponder(this, resp);
125         final boolean report = isReport(req);
126         final boolean annotate = isAnnotated(req);
127         final boolean openie = isOpenIE(req);
128         if (req.getContentType() == null) {
129             responder.sendError(400, "Invalid POST request, no Content-Type for the message body specified", report);
130             return;
131         }
132         final String uri = getInputIRIFromRequest(req);
133         final String format = getFormatFromRequestOrNegotiation(req);
134         if (format == null) {
135             responder.sendError(406, "Client accept header does not include a supported output format", report);
136             return;
137         }
138         if (openie) {
139           Any23PluginManager pManager = Any23PluginManager.getInstance();
140           pManager.loadJARDir(new File(getClass().getResource("apache-any23-openie").getPath()));
141         }
142         final ExtractionParameters eps = getExtractionParameters(req);
143         if ("application/x-www-form-urlencoded".equals(getContentTypeHeader(req))) {
144             if (uri != null) {
145                 log("Attempting conversion to '" + format + "' from IRI <" + uri + ">");
146                 responder.runExtraction(createHTTPDocumentSource(responder, uri, report), eps, format, report, annotate);
147                 return;
148             }
149             if (req.getParameter("body") == null) {
150                 responder.sendError(400, "Invalid POST request, parameter 'uri' or 'body' required", report);
151                 return;
152             }
153             String type = null;
154             if (req.getParameter("type") != null && !"".equals(req.getParameter("type"))) {
155                 type = req.getParameter("type");
156             }
157             log("Attempting conversion to '" + format + "' from body parameter");
158             responder.runExtraction(
159                     new StringDocumentSource(req.getParameter("body"), Servlet.DEFAULT_BASE_IRI, type),
160                     eps,
161                     format,
162                     report, annotate
163             );
164             return;
165         }
166         log("Attempting conversion to '" + format + "' from POST body");
167         responder.runExtraction(
168                 new ByteArrayDocumentSource(
169                         req.getInputStream(),
170                         Servlet.DEFAULT_BASE_IRI,
171                         getContentTypeHeader(req)
172                 ),
173                 eps,
174                 format,
175                 report, annotate
176         );
177     }
178 
179     private String getFormatFromRequestOrNegotiation(HttpServletRequest request) {
180         String fromRequest = getFormatFromRequest(request);
181         if (fromRequest != null && !"".equals(fromRequest) && !"best".equals(fromRequest)) {
182             return fromRequest;
183         }
184         MediaRangeSpec result = Any23Negotiator.getNegotiator().getBestMatch(request.getHeader("Accept"));
185         if (result == null) {
186             return null;
187         } else if (RDFFormat.N3.hasMIMEType(result.getMediaType())) {
188             return "n3";
189         } else if (RDFFormat.NQUADS.hasMIMEType(result.getMediaType())) {
190             return "nq";
191         } else if (RDFFormat.RDFXML.hasMIMEType(result.getMediaType())) {
192             return "rdf";
193         } else if (RDFFormat.NTRIPLES.hasMIMEType(result.getMediaType())) {
194             return "nt";
195         } else if (RDFFormat.JSONLD.hasMIMEType(result.getMediaType())) {
196             return "ld+json";
197         } else {
198             return "turtle"; // shouldn't happen however default is turtle
199         }
200     }
201 
202     private String getFormatFromRequest(HttpServletRequest request) {
203         if (request.getPathInfo() == null)
204             return "best";
205         String[] args = request.getPathInfo().split("/", 3);
206         if (args.length < 2 || "".equals(args[1])) {
207             if (request.getParameter("format") == null) {
208                 return "best";
209             } else {
210                 return request.getParameter("format");
211             }
212         }
213         return args[1];
214     }
215 
216     private String getInputIRIFromRequest(HttpServletRequest request) {
217         if (request.getPathInfo() == null)
218             return null;
219         String[] args = request.getPathInfo().split("/", 3);
220         if (args.length < 3) {
221             if (request.getParameter("uri") != null) {
222                 return request.getParameter("uri").trim();
223             }
224             if (request.getParameter("url") != null) {
225                 return request.getParameter("url").trim();
226             }
227             return null;
228         }
229         String uri = args[2];
230         if (request.getQueryString() != null) {
231             uri = uri + "?" + request.getQueryString();
232         }
233         if (!hasScheme(uri)) {
234             uri = "http://" + uri;
235         } else if (hasOnlySingleSlashAfterScheme(uri)) {
236             // This is to work around an issue where Tomcat 6.0.18 is
237             // too smart for us. Tomcat normalizes double-slashes in
238             // the path, and thus turns "http://" into "http:/" if it
239             // occurs in the path. So we restore the double slash.
240             uri = uri.replaceFirst(":/", "://");
241         }
242         return uri.trim();
243     }
244 
245 
246     private boolean hasScheme(String uri) {
247         return schemeRegex.matcher(uri).find();
248     }
249 
250     private boolean hasOnlySingleSlashAfterScheme(String uri) {
251         return schemeAndSingleSlashRegex.matcher(uri).find();
252     }
253 
254     private String getContentTypeHeader(HttpServletRequest req) {
255         String cType = "Content-Type";
256         if (req.getHeader(cType) == null)
257             return null;
258         if ("".equals(req.getHeader(cType)))
259             return null;
260         String contentType = req.getHeader(cType);
261         // strip off parameters such as ";charset=UTF-8"
262         int index = contentType.indexOf(';');
263         if (index == -1)
264             return contentType;
265         return contentType.substring(0, index);
266     }
267 
268     private DocumentSource createHTTPDocumentSource(WebResponder responder, String uri, boolean report)
269             throws IOException {
270         try {
271             if (!isValidIRI(uri)) {
272                 throw new URISyntaxException(uri, "@@@");
273             }
274             return createHTTPDocumentSource(responder.getRunner().getHTTPClient(), uri);
275         } catch (URISyntaxException ex) {
276             LOG.error("Invalid IRI detected", ex);
277             responder.sendError(400, "Invalid input IRI " + uri, report);
278             return null;
279         }
280     }
281 
282     protected DocumentSource createHTTPDocumentSource(HTTPClient httpClient, String uri)
283             throws IOException, URISyntaxException {
284         return new HTTPDocumentSource(httpClient, uri);
285     }
286 
287     private boolean isValidIRI(String s) {
288         try {
289             URI uri = new URI(s);
290             if (!"http".equals(uri.getScheme()) && !"https".equals(uri.getScheme())) {
291                 return false;
292             }
293         } catch (Exception e) {
294             return false;
295         }
296         return true;
297     }
298 
299     private ValidationMode getValidationMode(HttpServletRequest request) {
300         final String parameter = "validation-mode";
301         final String validationMode = request.getParameter(parameter);
302         if (validationMode == null)
303             return ValidationMode.NONE;
304         if ("none".equalsIgnoreCase(validationMode))
305             return ValidationMode.NONE;
306         if ("validate".equalsIgnoreCase(validationMode))
307             return ValidationMode.VALIDATE;
308         if ("validate-fix".equalsIgnoreCase(validationMode))
309             return ValidationMode.VALIDATE_AND_FIX;
310         throw new IllegalArgumentException(
311                 String.format("Invalid value '%s' for '%s' parameter.", validationMode, parameter)
312         );
313     }
314 
315     private ExtractionParameters getExtractionParameters(HttpServletRequest request) {
316         final ValidationMode mode = getValidationMode(request);
317         return new ExtractionParameters(DefaultConfiguration.singleton(), mode);
318     }
319 
320     private boolean isReport(HttpServletRequest request) {
321         return request.getParameter("report") != null;
322     }
323 
324     private boolean isAnnotated(HttpServletRequest request) {
325         return request.getParameter("annotate") != null;
326     }
327 
328     private boolean isOpenIE(HttpServletRequest request) {
329       return request.getParameter("openie") != null;
330   }
331 
332 }