1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.servlet;
19
20 import org.apache.any23.configuration.DefaultConfiguration;
21 import org.apache.any23.extractor.ExtractionParameters;
22 import org.apache.any23.http.HTTPClient;
23 import org.apache.any23.servlet.conneg.Any23Negotiator;
24 import org.apache.any23.servlet.conneg.MediaRangeSpec;
25 import org.apache.any23.source.ByteArrayDocumentSource;
26 import org.apache.any23.source.DocumentSource;
27 import org.apache.any23.source.HTTPDocumentSource;
28 import org.apache.any23.source.StringDocumentSource;
29 import org.openrdf.rio.RDFFormat;
30
31 import javax.servlet.ServletException;
32 import javax.servlet.http.HttpServlet;
33 import javax.servlet.http.HttpServletRequest;
34 import javax.servlet.http.HttpServletResponse;
35 import java.io.IOException;
36 import java.net.URI;
37 import java.net.URISyntaxException;
38 import java.util.regex.Pattern;
39
40 import static org.apache.any23.extractor.ExtractionParameters.ValidationMode;
41
42
43
44
45
46
47
48
49 public class Servlet extends HttpServlet {
50
51 public static final String DEFAULT_BASE_URI = "http://any23.org/tmp/";
52
53 private static final long serialVersionUID = 8207685628715421336L;
54
55
56 private final static Pattern schemeRegex =
57 Pattern.compile("^[a-zA-Z][a-zA-Z0-9.+-]*:");
58
59 @Override
60 protected void doGet(HttpServletRequest req, HttpServletResponse resp) throws IOException, ServletException {
61 final WebResponder responder = new WebResponder(this, resp);
62 final String format = getFormatFromRequestOrNegotiation(req);
63 final boolean report = isReport(req);
64 final boolean annotate = isAnnotated(req);
65 if (format == null) {
66 responder.sendError(406, "Client accept header does not include a supported output format", report);
67 return;
68 }
69 final String uri = getInputURIFromRequest(req);
70 if (uri == null) {
71 responder.sendError(404, "Missing URI in GET request. Try /format/http://example.com/myfile", report);
72 return;
73 }
74 final ExtractionParameters eps = getExtractionParameters(req);
75 responder.runExtraction(createHTTPDocumentSource(responder, uri, report), eps, format, report, annotate);
76 }
77
78 @Override
79 protected void doPost(HttpServletRequest req, HttpServletResponse resp) throws IOException {
80 final WebResponder responder = new WebResponder(this, resp);
81 final boolean report = isReport(req);
82 final boolean annotate = isAnnotated(req);
83 if (req.getContentType() == null) {
84 responder.sendError(400, "Invalid POST request, no Content-Type for the message body specified", report);
85 return;
86 }
87 final String uri = getInputURIFromRequest(req);
88 final String format = getFormatFromRequestOrNegotiation(req);
89 if (format == null) {
90 responder.sendError(406, "Client accept header does not include a supported output format", report);
91 return;
92 }
93 final ExtractionParameters eps = getExtractionParameters(req);
94 if ("application/x-www-form-urlencoded".equals(getContentTypeHeader(req))) {
95 if (uri != null) {
96 log("Attempting conversion to '" + format + "' from URI <" + uri + ">");
97 responder.runExtraction(createHTTPDocumentSource(responder, uri, report), eps, format, report, annotate);
98 return;
99 }
100 if (req.getParameter("body") == null) {
101 responder.sendError(400, "Invalid POST request, parameter 'uri' or 'body' required", report);
102 return;
103 }
104 String type = null;
105 if (req.getParameter("type") != null && !"".equals(req.getParameter("type"))) {
106 type = req.getParameter("type");
107 }
108 log("Attempting conversion to '" + format + "' from body parameter");
109 responder.runExtraction(
110 new StringDocumentSource(req.getParameter("body"), Servlet.DEFAULT_BASE_URI, type),
111 eps,
112 format,
113 report, annotate
114 );
115 return;
116 }
117 log("Attempting conversion to '" + format + "' from POST body");
118 responder.runExtraction(
119 new ByteArrayDocumentSource(
120 req.getInputStream(),
121 Servlet.DEFAULT_BASE_URI,
122 getContentTypeHeader(req)
123 ),
124 eps,
125 format,
126 report, annotate
127 );
128 }
129
130 private String getFormatFromRequestOrNegotiation(HttpServletRequest request) {
131 String fromRequest = getFormatFromRequest(request);
132 if (fromRequest != null && !"".equals(fromRequest) && !"best".equals(fromRequest)) {
133 return fromRequest;
134 }
135 MediaRangeSpec result = Any23Negotiator.getNegotiator().getBestMatch(request.getHeader("Accept"));
136 if (result == null) {
137 return null;
138 }
139 else if (RDFFormat.TURTLE.hasMIMEType(result.getMediaType())) {
140 return "turtle";
141 }
142 else if (RDFFormat.N3.hasMIMEType(result.getMediaType())) {
143 return "n3";
144 }
145 else if (RDFFormat.NQUADS.hasMIMEType(result.getMediaType())) {
146 return "nq";
147 }
148 else if (RDFFormat.RDFXML.hasMIMEType(result.getMediaType())) {
149 return "rdf";
150 }
151 else if (RDFFormat.NTRIPLES.hasMIMEType(result.getMediaType())) {
152 return "nt";
153 }
154 else {
155 return "turtle";
156 }
157 }
158
159 private String getFormatFromRequest(HttpServletRequest request) {
160 if (request.getPathInfo() == null) return "best";
161 String[] args = request.getPathInfo().split("/", 3);
162 if (args.length < 2 || "".equals(args[1])) {
163 if (request.getParameter("format") == null) {
164 return "best";
165 } else {
166 return request.getParameter("format");
167 }
168 }
169 return args[1];
170 }
171
172 private String getInputURIFromRequest(HttpServletRequest request) {
173 if (request.getPathInfo() == null) return null;
174 String[] args = request.getPathInfo().split("/", 3);
175 if (args.length < 3) {
176 if (request.getParameter("uri") != null) {
177 return request.getParameter("uri").trim();
178 }
179 if (request.getParameter("url") != null) {
180 return request.getParameter("url").trim();
181 }
182 return null;
183 }
184 String uri = args[2];
185 if (request.getQueryString() != null) {
186 uri = uri + "?" + request.getQueryString();
187 }
188 if (!hasScheme(uri)) {
189 uri = "http://" + uri;
190 } else if (hasOnlySingleSlashAfterScheme(uri)) {
191
192
193
194
195 uri = uri.replaceFirst(":/", "://");
196 }
197 return uri.trim();
198 }
199
200
201 private boolean hasScheme(String uri) {
202 return schemeRegex.matcher(uri).find();
203 }
204
205 private final static Pattern schemeAndSingleSlashRegex =
206 Pattern.compile("^[a-zA-Z][a-zA-Z0-9.+-]*:/[^/]");
207
208 private boolean hasOnlySingleSlashAfterScheme(String uri) {
209 return schemeAndSingleSlashRegex.matcher(uri).find();
210 }
211
212 private String getContentTypeHeader(HttpServletRequest req) {
213 if (req.getHeader("Content-Type") == null) return null;
214 if ("".equals(req.getHeader("Content-Type"))) return null;
215 String contentType = req.getHeader("Content-Type");
216
217 int index = contentType.indexOf(";");
218 if (index == -1) return contentType;
219 return contentType.substring(0, index);
220 }
221
222 private DocumentSource createHTTPDocumentSource(WebResponder responder, String uri, boolean report)
223 throws IOException {
224 try {
225 if (!isValidURI(uri)) {
226 throw new URISyntaxException(uri, "@@@");
227 }
228 return createHTTPDocumentSource(responder.getRunner().getHTTPClient(), uri);
229 } catch (URISyntaxException ex) {
230 responder.sendError(400, "Invalid input URI " + uri, report);
231 return null;
232 }
233 }
234
235 protected DocumentSource createHTTPDocumentSource(HTTPClient httpClient, String uri)
236 throws IOException, URISyntaxException {
237 return new HTTPDocumentSource(httpClient, uri);
238 }
239
240 private boolean isValidURI(String s) {
241 try {
242 URI uri = new URI(s);
243 if (!"http".equals(uri.getScheme()) && !"https".equals(uri.getScheme())) {
244 return false;
245 }
246 } catch (URISyntaxException e) {
247 return false;
248 }
249 return true;
250 }
251
252 private ValidationMode getValidationMode(HttpServletRequest request) {
253 final String PARAMETER = "validation-mode";
254 final String validationMode = request.getParameter(PARAMETER);
255 if(validationMode == null) return ValidationMode.None;
256 if("none".equalsIgnoreCase(validationMode)) return ValidationMode.None;
257 if("validate".equalsIgnoreCase(validationMode)) return ValidationMode.Validate;
258 if("validate-fix".equalsIgnoreCase(validationMode)) return ValidationMode.ValidateAndFix;
259 throw new IllegalArgumentException(
260 String.format("Invalid value '%s' for '%s' parameter.", validationMode, PARAMETER)
261 );
262 }
263
264 private ExtractionParameters getExtractionParameters(HttpServletRequest request) {
265 final ValidationMode mode = getValidationMode(request);
266 return new ExtractionParameters(DefaultConfiguration.singleton(), mode);
267 }
268
269 private boolean isReport(HttpServletRequest request) {
270 return request.getParameter("report") != null;
271 }
272
273 private boolean isAnnotated(HttpServletRequest request) {
274 return request.getParameter("annotate") != null;
275 }
276
277 }