View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.rdf;
19  
20  import org.apache.any23.extractor.IssueReport;
21  import org.apache.any23.extractor.ExtractionContext;
22  import org.apache.any23.extractor.ExtractionResult;
23  import org.apache.any23.rdf.Any23ValueFactoryWrapper;
24  import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
25  import org.eclipse.rdf4j.rio.ParseErrorListener;
26  import org.eclipse.rdf4j.rio.RDFFormat;
27  import org.eclipse.rdf4j.rio.RDFHandlerException;
28  import org.eclipse.rdf4j.rio.RDFParseException;
29  import org.eclipse.rdf4j.rio.RDFParser;
30  import org.eclipse.rdf4j.rio.Rio;
31  import org.eclipse.rdf4j.rio.helpers.BasicParserSettings;
32  import org.eclipse.rdf4j.rio.helpers.RDFaParserSettings;
33  import org.eclipse.rdf4j.rio.helpers.RDFaVersion;
34  import org.eclipse.rdf4j.rio.turtle.TurtleParser;
35  import org.semanticweb.owlapi.rio.OWLAPIRDFFormat;
36  import org.slf4j.Logger;
37  import org.slf4j.LoggerFactory;
38  
39  import java.io.IOException;
40  import java.io.InputStream;
41  import java.io.Reader;
42  import java.util.Collections;
43  import java.util.HashSet;
44  
45  /**
46   * This factory provides a common logic for creating and configuring correctly any <i>RDF</i> parser used within the
47   * library.
48   *
49   * @author Michele Mostarda (mostarda@fbk.eu)
50   */
51  public class RDFParserFactory {
52  
53      private static final Logger logger = LoggerFactory.getLogger(RDFParserFactory.class);
54  
55      private static class InstanceHolder {
56          private static final RDFParserFactoryserFactory.html#RDFParserFactory">RDFParserFactory instance = new RDFParserFactory();
57      }
58  
59      public static RDFParserFactory getInstance() {
60          return InstanceHolder.instance;
61      }
62  
63      /**
64       * Returns a new instance of a configured TurtleParser.
65       *
66       * @param verifyDataType
67       *            data verification enable if <code>true</code>.
68       * @param stopAtFirstError
69       *            the parser stops at first error if <code>true</code>.
70       * @param extractionContext
71       *            the extraction context where the parser is used.
72       * @param extractionResult
73       *            the output extraction result.
74       * 
75       * @return a new instance of a configured Turtle parser.
76       */
77      public RDFParser getTurtleParserInstance(final boolean verifyDataType, final boolean stopAtFirstError,
78              final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
79          if (extractionResult == null) {
80              throw new NullPointerException("extractionResult cannot be null.");
81          }
82          final TurtleParser parser = new ExtendedTurtleParser();
83          configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
84          return parser;
85      }
86  
87      /**
88       * Returns a new instance of a configured RDFaParser, set to RDFa-1.0 compatibility mode.
89       *
90       * @param verifyDataType
91       *            data verification enable if <code>true</code>.
92       * @param stopAtFirstError
93       *            the parser stops at first error if <code>true</code>.
94       * @param extractionContext
95       *            the extraction context where the parser is used.
96       * @param extractionResult
97       *            the output extraction result.
98       * 
99       * @return a new instance of a configured RDFXML parser.
100      */
101     public RDFParser getRDFa10Parser(final boolean verifyDataType, final boolean stopAtFirstError,
102             final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
103         final RDFParser parser = Rio.createParser(RDFFormat.RDFA);
104         parser.getParserConfig().set(RDFaParserSettings.RDFA_COMPATIBILITY, RDFaVersion.RDFA_1_0);
105         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
106         return parser;
107     }
108 
109     /**
110      * Returns a new instance of a configured RDFaParser, set to RDFa-1.1 compatibility mode.
111      *
112      * @param verifyDataType
113      *            data verification enable if <code>true</code>.
114      * @param stopAtFirstError
115      *            the parser stops at first error if <code>true</code>.
116      * @param extractionContext
117      *            the extraction context where the parser is used.
118      * @param extractionResult
119      *            the output extraction result.
120      * 
121      * @return a new instance of a configured RDFXML parser.
122      */
123     public RDFParser getRDFa11Parser(final boolean verifyDataType, final boolean stopAtFirstError,
124             final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
125         final RDFParser parser = Rio.createParser(RDFFormat.RDFA);
126         parser.getParserConfig().set(RDFaParserSettings.RDFA_COMPATIBILITY, RDFaVersion.RDFA_1_1);
127         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
128         return parser;
129     }
130 
131     /**
132      * Returns a new instance of a configured RDFXMLParser.
133      *
134      * @param verifyDataType
135      *            data verification enable if <code>true</code>.
136      * @param stopAtFirstError
137      *            the parser stops at first error if <code>true</code>.
138      * @param extractionContext
139      *            the extraction context where the parser is used.
140      * @param extractionResult
141      *            the output extraction result.
142      * 
143      * @return a new instance of a configured RDFXML parser.
144      */
145     public RDFParser getRDFXMLParser(final boolean verifyDataType, final boolean stopAtFirstError,
146             final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
147         final RDFParser parser = Rio.createParser(RDFFormat.RDFXML);
148         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
149         return parser;
150     }
151 
152     /**
153      * Returns a new instance of a configured NTriplesParser.
154      *
155      * @param verifyDataType
156      *            data verification enable if <code>true</code>.
157      * @param stopAtFirstError
158      *            the parser stops at first error if <code>true</code>.
159      * @param extractionContext
160      *            the extraction context where the parser is used.
161      * @param extractionResult
162      *            the output extraction result.
163      * 
164      * @return a new instance of a configured NTriples parser.
165      */
166     public RDFParser getNTriplesParser(final boolean verifyDataType, final boolean stopAtFirstError,
167             final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
168         final RDFParser parser = Rio.createParser(RDFFormat.NTRIPLES);
169         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
170         return parser;
171     }
172 
173     /**
174      * Returns a new instance of a configured NQuadsParser.
175      *
176      * @param verifyDataType
177      *            data verification enable if <code>true</code>.
178      * @param stopAtFirstError
179      *            the parser stops at first error if <code>true</code>.
180      * @param extractionContext
181      *            the extraction context where the parser is used.
182      * @param extractionResult
183      *            the output extraction result.
184      * 
185      * @return a new instance of a configured NQuads parser.
186      */
187     public RDFParser getNQuadsParser(final boolean verifyDataType, final boolean stopAtFirstError,
188             final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
189         final RDFParser parser = Rio.createParser(RDFFormat.NQUADS);
190         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
191         return parser;
192     }
193 
194     /**
195      * Returns a new instance of a configured ManchesterSyntaxParser.
196      *
197      * @param verifyDataType
198      *            data verification enable if <code>true</code>.
199      * @param stopAtFirstError
200      *            the parser stops at first error if <code>true</code>.
201      * @param extractionContext
202      *            the extraction context where the parser is used.
203      * @param extractionResult
204      *            the output extraction result.
205      * 
206      * @return a new instance of a configured Manchester Syntax parser.
207      */
208     public RDFParser getManchesterSyntaxParser(final boolean verifyDataType, final boolean stopAtFirstError,
209             final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
210         final RDFParser parser = Rio.createParser(OWLAPIRDFFormat.MANCHESTER_OWL);
211         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
212         return parser;
213     }
214 
215     /**
216      * Returns a new instance of a configured FunctionalSyntaxParser.
217      *
218      * @param verifyDataType
219      *            data verification enable if <code>true</code>.
220      * @param stopAtFirstError
221      *            the parser stops at first error if <code>true</code>.
222      * @param extractionContext
223      *            the extraction context where the parser is used.
224      * @param extractionResult
225      *            the output extraction result.
226      * 
227      * @return a new instance of a configured Functional Syntax parser.
228      */
229     public RDFParser getFunctionalSyntaxParser(final boolean verifyDataType, final boolean stopAtFirstError,
230             final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
231         final RDFParser parser = Rio.createParser(OWLAPIRDFFormat.OWL_FUNCTIONAL);
232         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
233         return parser;
234     }
235 
236     /**
237      * Returns a new instance of a configured TriXParser.
238      *
239      * @param verifyDataType
240      *            data verification enable if <code>true</code>.
241      * @param stopAtFirstError
242      *            the parser stops at first error if <code>true</code>.
243      * @param extractionContext
244      *            the extraction context where the parser is used.
245      * @param extractionResult
246      *            the output extraction result.
247      * 
248      * @return a new instance of a configured TriX parser.
249      */
250     public RDFParser getTriXParser(final boolean verifyDataType, final boolean stopAtFirstError,
251             final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
252         final RDFParser parser = Rio.createParser(RDFFormat.TRIX);
253         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
254         return parser;
255     }
256 
257     /**
258      * Returns a new instance of a configured <i>SesameJSONLDParser</i>.
259      * 
260      * @param verifyDataType
261      *            data verification enable if <code>true</code>.
262      * @param stopAtFirstError
263      *            the parser stops at first error if <code>true</code>.
264      * @param extractionContext
265      *            the extraction context where the parser is used.
266      * @param extractionResult
267      *            the output extraction result.
268      * 
269      * @return a new instance of a configured JSONLDParser parser.
270      */
271     public RDFParser getJSONLDParser(final boolean verifyDataType, final boolean stopAtFirstError,
272             final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
273         final RDFParser parser = Rio.createParser(RDFFormat.JSONLD);
274         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
275         return parser;
276     }
277 
278     /**
279      * Configures the given parser on the specified extraction result setting the policies for data verification and
280      * error handling.
281      *
282      * @param parser
283      *            the parser to be configured.
284      * @param verifyDataType
285      *            enables the data verification.
286      * @param stopAtFirstError
287      *            enables the tolerant error handling.
288      * @param extractionContext
289      *            the extraction context in which the parser is used.
290      * @param extractionResult
291      *            the extraction result used to collect the parsed data.
292      */
293     // TODO: what about passing just default language and ErrorReport to configureParser() ?
294     private void configureParser(final RDFParser parser, final boolean verifyDataType, final boolean stopAtFirstError,
295             final ExtractionContext extractionContext, final ExtractionResult extractionResult) {
296         parser.getParserConfig().setNonFatalErrors(
297                 stopAtFirstError ? Collections.emptySet() : new HashSet<>(parser.getSupportedSettings()));
298         parser.set(BasicParserSettings.FAIL_ON_UNKNOWN_DATATYPES, verifyDataType);
299         parser.set(BasicParserSettings.VERIFY_DATATYPE_VALUES, verifyDataType);
300 
301         parser.setParseErrorListener(new InternalParseErrorListener(extractionResult));
302         parser.setValueFactory(new Any23ValueFactoryWrapper(SimpleValueFactory.getInstance(), extractionResult,
303                 extractionContext.getDefaultLanguage()));
304         parser.setRDFHandler(new RDFHandlerAdapter(extractionResult));
305     }
306 
307     /**
308      * Internal listener used to trace <i>RDF</i> parse errors.
309      */
310     private static class InternalParseErrorListener implements ParseErrorListener {
311 
312         private final IssueReport extractionResult;
313 
314         public InternalParseErrorListener(IssueReport er) {
315             extractionResult = er;
316         }
317 
318         @Override
319         public void warning(String msg, long lineNo, long colNo) {
320             try {
321                 extractionResult.notifyIssue(IssueReport.IssueLevel.WARNING, msg, lineNo, colNo);
322             } catch (Exception e) {
323                 notifyExceptionInNotification(e);
324             }
325         }
326 
327         @Override
328         public void error(String msg, long lineNo, long colNo) {
329             try {
330                 extractionResult.notifyIssue(IssueReport.IssueLevel.ERROR, msg, lineNo, colNo);
331             } catch (Exception e) {
332                 notifyExceptionInNotification(e);
333             }
334         }
335 
336         @Override
337         public void fatalError(String msg, long lineNo, long colNo) {
338             try {
339                 extractionResult.notifyIssue(IssueReport.IssueLevel.FATAL, msg, lineNo, colNo);
340             } catch (Exception e) {
341                 notifyExceptionInNotification(e);
342             }
343         }
344 
345         private void notifyExceptionInNotification(Exception e) {
346             if (logger != null) {
347                 logger.error("An exception occurred while notifying an error.", e);
348             }
349         }
350     }
351 
352     /**
353      * This extended Turtle parser sets the default namespace to the base IRI before the parsing.
354      */
355     private static class ExtendedTurtleParser extends TurtleParser {
356         @Override
357         public void parse(Reader reader, String baseIRI) throws IOException, RDFParseException, RDFHandlerException {
358             setNamespace("", baseIRI);
359             super.parse(reader, baseIRI);
360         }
361 
362         @Override
363         public void parse(InputStream in, String baseIRI) throws IOException, RDFParseException, RDFHandlerException {
364             setNamespace("", baseIRI);
365             super.parse(in, baseIRI);
366         }
367     }
368 }