/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17  
18  package org.apache.any23.servlet;
19  
20  import java.io.ByteArrayOutputStream;
21  import java.io.IOException;
22  import java.io.PrintStream;
23  import java.nio.charset.Charset;
24  import java.security.cert.CertificateException;
25  import java.util.ArrayList;
26  import java.util.Collection;
27  import java.util.List;
28  import java.util.stream.Collectors;
29  import javax.servlet.ServletOutputStream;
30  import javax.servlet.http.HttpServletResponse;
31  import org.apache.any23.Any23;
32  import org.apache.any23.ExtractionReport;
33  import org.apache.any23.configuration.Settings;
34  import org.apache.any23.extractor.ExtractionException;
35  import org.apache.any23.extractor.ExtractionParameters;
36  import org.apache.any23.extractor.Extractor;
37  import org.apache.any23.extractor.IssueReport;
38  import org.apache.any23.filter.IgnoreAccidentalRDFa;
39  import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments;
40  import org.apache.any23.source.DocumentSource;
41  import org.apache.any23.validator.SerializationException;
42  import org.apache.any23.validator.XMLValidationReportSerializer;
43  import org.apache.any23.writer.CompositeTripleHandler;
44  import org.apache.any23.writer.CountingTripleHandler;
45  import org.apache.any23.writer.FormatWriter;
46  import org.apache.any23.writer.TripleWriterFactory;
47  import org.apache.any23.writer.ReportingTripleHandler;
48  import org.apache.any23.writer.TripleHandler;
49  import org.apache.any23.writer.TripleHandlerException;
50  import org.apache.any23.writer.WriterFactory;
51  import org.apache.any23.writer.WriterFactoryRegistry;
52  
53  /**
54   * This class is responsible for building the {@link Servlet}
55   * web response.
56   */
57  class WebResponder {
58  
59      private static final WriterFactoryRegistry writerRegistry = WriterFactoryRegistry.getInstance();
60  
61      /**
62       * Library facade.
63       */
64      private final Any23 runner;
65  
66      /**
67       * Servlet for which building the response.
68       */
69      private Servlet any23servlet;
70  
71      /**
72       * Servlet response object.
73       */
74      private HttpServletResponse response;
75  
76      /**
77       * RDF triple writer.
78       */
79      private TripleHandler rdfWriter = null;
80  
81      /**
82       * Error and statistics reporter.
83       */
84      private ReportingTripleHandler reporter = null;
85  
86      /**
87       * Type of expected output.
88       */
89      private String outputMediaType = null;
90  
91      /**
92       * The output stream.
93       */
94      private ByteArrayOutputStream byteOutStream = new ByteArrayOutputStream();
95  
96      public WebResponder(Servlet any23servlet, HttpServletResponse response) {
97          this.any23servlet = any23servlet;
98          this.response = response;
99          this.runner = new Any23();
100         runner.setHTTPUserAgent("Apache Any23 Servlet http://any23.org/");
101     }
102 
103     protected Any23 getRunner() {
104         return runner;
105     }
106 
107     public void runExtraction(
108             DocumentSource in,
109             ExtractionParameters eps,
110             String format,
111             boolean report, boolean annotate
112     ) throws IOException {
113         if (in == null)
114           return;
115         if (!initRdfWriter(format, report, annotate))
116           return;
117         ExtractionReport er = null;
118         try {
119             er = runner.extract(eps, in, rdfWriter);
120             rdfWriter.close();
121             if (! er.hasMatchingExtractors() ) {
122                 sendError(
123                         415,
124                         "No suitable extractor found for this media type",
125                         null,
126                         er,
127                         report
128                 );
129                 return;
130             }
131         } catch (IOException ioe) {
132             // IO Error.
133             if (ioe.getCause() instanceof CertificateException) {
134                 final String errMsg = "Could not fetch input, IO Error.";
135                 any23servlet.log(errMsg, ioe.getCause());
136                 sendError(502, errMsg, ioe, null, report);
137                 return;
138             }
139             any23servlet.log("Could not fetch input", ioe);
140             sendError(502, "Could not fetch input.", ioe, null, report);
141             return;
142         } catch (ExtractionException e) {
143             if (rdfWriter != null) {
144                 try {
145                     rdfWriter.close();
146                 } catch (TripleHandlerException the) {
147                     throw new RuntimeException("Error while closing TripleHandler", the);
148                 }
149             }
150 
151             // Extraction error. Although there is a critical error we still wish 
152             // to return accurate, partial extraction results to the user
153             String extractionError = "Failed to fully parse input. The extraction result, at the bottom "
154                     + "of this response, if any, will contain extractions only up until the extraction error.";
155             any23servlet.log(extractionError, e);
156             sendError(502, extractionError, e, er, report);
157             return;
158         } catch (Exception e) {
159             any23servlet.log("Internal error", e);
160             sendError(500, "Internal error.", e, null, report);
161             return;
162         }
163 
164         /* *** No triples found. *** */
165         any23servlet.log("Extraction complete, " + reporter.getTotalTriples() + " triples");
166 
167         // Regular response.
168         response.setContentType(outputMediaType);
169         response.setStatus(200);
170         // Set the output encoding equals to the input one.
171         final String charsetEncoding = er.getEncoding();
172         if (Charset.isSupported(charsetEncoding)) {
173             response.setCharacterEncoding(er.getEncoding());
174         } else {
175             response.setCharacterEncoding("UTF-8");
176         }
177 
178         final ServletOutputStream sos = response.getOutputStream();
179         final byte[] data = byteOutStream.toByteArray();
180         if(report) {
181             final PrintStream ps = new PrintStream(sos);
182             try {
183                 printHeader(ps);
184                 printResponse(reporter, er, data, ps);
185             } catch (Exception e) {
186                 throw new RuntimeException("An error occurred while serializing the output response.", e);
187             } finally {
188                 ps.close();
189             }
190         } else {
191             sos.write(data);
192         }
193     }
194 
195     public void sendError(int code, String msg, boolean report) throws IOException {
196         sendError(code, msg, null, null, report);
197     }
198     
199     private void printHeader(PrintStream ps) {
200         ps.println("<?xml version=\"1.0\" encoding=\"UTF-8\" ?>");
201     }
202 
203     private void printResponse(ReportingTripleHandler rth, ExtractionReport er, byte[] data, PrintStream ps) {
204         ps.println("<response>");
205         printExtractors(rth, ps);
206         printReport(null, null, er, ps);
207         printData(data, ps);
208         ps.println("</response>");
209     }
210 
211     private void printExtractors(ReportingTripleHandler rth, PrintStream ps) {
212         ps.println("<extractors>");
213         for (String extractor : rth.getExtractorNames()) {
214             ps.print("<extractor>");
215             ps.print(extractor);
216             ps.println("</extractor>");
217         }
218         ps.println("</extractors>");
219     }
220     
221     private void printIssueReport(ExtractionReport er, PrintStream ps) {
222         ps.println("<issueReport>");
223         for(Extractor<?> extractor : er.getMatchingExtractors()) {
224             final String name = extractor.getDescription().getExtractorName();
225             final Collection<IssueReport.Issue> extractorIssues = er.getExtractorIssues(name);
226             if(extractorIssues.isEmpty())
227                 continue;
228             ps.println( String.format("<extractorIssues extractor=\"%s\">", name));
229             for(IssueReport.Issue issue : er.getExtractorIssues(name)) {
230                 ps.println(
231                         String.format(
232                                 "<issue level=\"%s\" row=\"%d\" col=\"%d\">%s</issue>",
233                                 issue.getLevel().toString(),
234                                 issue.getRow(),
235                                 issue.getCol(),
236                                 issue.getMessage()
237                         )
238                 );
239             }
240             ps.println("</extractorIssues>");
241         }
242         ps.println("</issueReport>");
243 
244     }
245 
246     private void printReport(String msg, Throwable e, ExtractionReport er, PrintStream ps) {
247         XMLValidationReportSerializer#XMLValidationReportSerializer">XMLValidationReportSerializer reportSerializer = new XMLValidationReportSerializer();
248         ps.println("<report>");
249 
250         // Human readable error message.
251         if(msg != null) {
252             ps.printf("<message>%s</message>%n", msg);
253         } else {
254             ps.print("<message/>\n");
255         }
256 
257         // Error stack trace.
258         if(e != null) {
259             ps.println("<error>");
260             ps.println("<![CDATA[");
261             e.printStackTrace(ps);
262             ps.println("]]>");
263             ps.println("</error>");
264         } else {
265             ps.println("<error/>");
266         }
267 
268         // Issue Report.
269         printIssueReport(er, ps);
270 
271         // Validation report.
272         try {
273             reportSerializer.serialize(er.getValidationReport(), ps);
274         } catch (SerializationException se) {
275             ps.println("An error occurred while serializing error.");
276             se.printStackTrace(ps);
277         }
278         ps.println("</report>");
279     }
280 
281     private void printData(byte[] data, PrintStream ps) {
282         ps.println("<data>");
283         ps.println("<![CDATA[");
284         try {
285             ps.write(data);
286         } catch (IOException ioe) {
287             ps.println("An error occurred while serializing data.");
288             ioe.printStackTrace(ps);
289         }
290         ps.println("]]>");
291         ps.println("</data>");
292     }
293 
294     private void sendError(int code, String msg, Exception e, ExtractionReport er, boolean report)
295     throws IOException {
296         response.setStatus(code);
297         response.setContentType("text/plain");
298         final ServletOutputStream sos = response.getOutputStream();
299         final PrintStream ps = new PrintStream(sos);
300         final byte[] data = byteOutStream.toByteArray();
301         if (report) {
302             try {
303                 printHeader(ps);
304                 printReport(msg, e, er, ps);
305             } finally {
306                 ps.close();
307             }
308         } else {
309             ps.println(msg);
310             if (e != null) {
311                 ps.println("================================================================");
312                 e.printStackTrace(ps);
313                 ps.println("================================================================");
314                 printData(data, ps);
315             }
316         }
317     }
318 
319     private boolean initRdfWriter(String format, boolean report, boolean annotate) throws IOException {
320         final WriterFactory factory = getFormatWriter(format);
321         if (!(factory instanceof TripleWriterFactory)) {
322             sendError(
323                     400,
324                     "Invalid format '" + format + "', try one of: "
325                             + writerRegistry.getWriters().stream()
326                             .filter(f -> f instanceof TripleWriterFactory)
327                             .map(WriterFactory::getIdentifier).collect(Collectors.toList()),
328                     null,
329                     null,
330                     report
331             );
332             return false;
333         }
334         TripleHandler fw = ((TripleWriterFactory) factory).getTripleWriter(byteOutStream, Settings.of());
335         if (fw instanceof FormatWriter) {
336             ((FormatWriter)fw).setAnnotated(annotate);
337         }
338         outputMediaType = ((TripleWriterFactory) factory).getTripleFormat().getMimeType();
339         List<TripleHandler> tripleHandlers = new ArrayList<>();
340         tripleHandlers.add(new IgnoreAccidentalRDFa(fw));
341         tripleHandlers.add(new CountingTripleHandler());
342         rdfWriter = new CompositeTripleHandler(tripleHandlers);
343         reporter = new ReportingTripleHandler(rdfWriter);
344         rdfWriter = new IgnoreAccidentalRDFa(
345             new IgnoreTitlesOfEmptyDocuments(reporter),
346             true    // suppress stylesheet triples.
347         );
348         return true;
349     }
350 
351     private WriterFactory getFormatWriter(String format) throws IOException {
352         final String finalFormat;
353         // FIXME: Remove this hardcoded set
354         if ("rdf".equals(format) || "xml".equals(format) || "rdfxml".equals(format)) {
355             finalFormat = "rdfxml";
356         } else if ("turtle".equals(format) || "ttl".equals(format)) {
357             finalFormat = "turtle";
358         } else if ("n3".equals(format)) {
359             finalFormat = "turtle";
360         } else if ("n-triples".equals(format) || "ntriples".equals(format) || "nt".equals(format)) {
361             finalFormat = "ntriples";
362         } else if("nquads".equals(format) || "n-quads".equals(format) || "nq".equals(format)) {
363             finalFormat = "nquads";
364         } else if("trix".equals(format)) {
365             finalFormat = "trix";
366         } else if("json".equals(format)) {
367             finalFormat = "json";
368         } else if("jsonld".equals(format)){
369             finalFormat = "jsonld";        
370         } else {
371             return null;
372         }
373         return writerRegistry.getWriterByIdentifier(finalFormat);
374     }
375 
376 }