View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.rdf;
19  
20  import org.apache.any23.extractor.IssueReport;
21  import org.apache.any23.extractor.ExtractionContext;
22  import org.apache.any23.extractor.ExtractionResult;
23  import org.apache.any23.rdf.Any23ValueFactoryWrapper;
24  import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
25  import org.eclipse.rdf4j.rio.ParseErrorListener;
26  import org.eclipse.rdf4j.rio.RDFFormat;
27  import org.eclipse.rdf4j.rio.RDFHandlerException;
28  import org.eclipse.rdf4j.rio.RDFParseException;
29  import org.eclipse.rdf4j.rio.RDFParser;
30  import org.eclipse.rdf4j.rio.Rio;
31  import org.eclipse.rdf4j.rio.helpers.BasicParserSettings;
32  import org.eclipse.rdf4j.rio.helpers.RDFaParserSettings;
33  import org.eclipse.rdf4j.rio.helpers.RDFaVersion;
34  import org.eclipse.rdf4j.rio.turtle.TurtleParser;
35  import org.semanticweb.owlapi.rio.OWLAPIRDFFormat;
36  import org.slf4j.Logger;
37  import org.slf4j.LoggerFactory;
38  
39  import java.io.IOException;
40  import java.io.InputStream;
41  import java.io.Reader;
42  import java.util.Collections;
43  import java.util.HashSet;
44  
45  /**
46   * This factory provides a common logic for creating and configuring correctly
47   * any <i>RDF</i> parser used within the library.
48   *
49   * @author Michele Mostarda (mostarda@fbk.eu)
50   */
51  public class RDFParserFactory {
52  
53      private static final Logger logger = LoggerFactory.getLogger(RDFParserFactory.class);
54  
55      private static class InstanceHolder {
56          private static final RDFParserFactoryserFactory.html#RDFParserFactory">RDFParserFactory instance = new RDFParserFactory();
57      }
58  
59      public static RDFParserFactory getInstance() {
60          return InstanceHolder.instance;
61      }
62  
63      /**
64       * Returns a new instance of a configured TurtleParser.
65       *
66       * @param verifyDataType data verification enable if <code>true</code>.
67       * @param stopAtFirstError the parser stops at first error if <code>true</code>.
68       * @param extractionContext the extraction context where the parser is used.
69       * @param extractionResult the output extraction result.
70       * @return a new instance of a configured Turtle parser.
71       */
72      public RDFParser getTurtleParserInstance(
73              final boolean verifyDataType,
74              final boolean stopAtFirstError,
75              final ExtractionContext extractionContext,
76              final ExtractionResult extractionResult
77      ) {
78          if (extractionResult == null) {
79              throw new NullPointerException("extractionResult cannot be null.");
80          }
81          final TurtleParser parser = new ExtendedTurtleParser();
82          configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
83          return parser;
84      }
85  
86      /**
87       * Returns a new instance of a configured RDFaParser, set to RDFa-1.0 compatibility mode.
88       *
89       * @param verifyDataType data verification enable if <code>true</code>.
90       * @param stopAtFirstError the parser stops at first error if <code>true</code>.
91       * @param extractionContext the extraction context where the parser is used.
92       * @param extractionResult the output extraction result.
93       * @return a new instance of a configured RDFXML parser.
94       */
95      public RDFParser getRDFa10Parser(
96              final boolean verifyDataType,
97              final boolean stopAtFirstError,
98              final ExtractionContext extractionContext,
99              final ExtractionResult extractionResult
100     ) {
101         final RDFParser parser = Rio.createParser(RDFFormat.RDFA);
102         parser.getParserConfig().set(RDFaParserSettings.RDFA_COMPATIBILITY, RDFaVersion.RDFA_1_0);
103         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
104         return parser;
105     }
106 
107     /**
108      * Returns a new instance of a configured RDFaParser, set to RDFa-1.1 compatibility mode.
109      *
110      * @param verifyDataType data verification enable if <code>true</code>.
111      * @param stopAtFirstError the parser stops at first error if <code>true</code>.
112      * @param extractionContext the extraction context where the parser is used.
113      * @param extractionResult the output extraction result.
114      * @return a new instance of a configured RDFXML parser.
115      */
116     public RDFParser getRDFa11Parser(
117             final boolean verifyDataType,
118             final boolean stopAtFirstError,
119             final ExtractionContext extractionContext,
120             final ExtractionResult extractionResult
121     ) {
122         final RDFParser parser = Rio.createParser(RDFFormat.RDFA);
123         parser.getParserConfig().set(RDFaParserSettings.RDFA_COMPATIBILITY, RDFaVersion.RDFA_1_1);
124         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
125         return parser;
126     }
127 
128     /**
129      * Returns a new instance of a configured RDFXMLParser.
130      *
131      * @param verifyDataType data verification enable if <code>true</code>.
132      * @param stopAtFirstError the parser stops at first error if <code>true</code>.
133      * @param extractionContext the extraction context where the parser is used.
134      * @param extractionResult the output extraction result.
135      * @return a new instance of a configured RDFXML parser.
136      */
137     public RDFParser getRDFXMLParser(
138             final boolean verifyDataType,
139             final boolean stopAtFirstError,
140             final ExtractionContext extractionContext,
141             final ExtractionResult extractionResult
142     ) {
143         final RDFParser parser = Rio.createParser(RDFFormat.RDFXML);
144         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
145         return parser;
146     }
147 
148     /**
149      * Returns a new instance of a configured NTriplesParser.
150      *
151      * @param verifyDataType data verification enable if <code>true</code>.
152      * @param stopAtFirstError the parser stops at first error if <code>true</code>.
153      * @param extractionContext the extraction context where the parser is used.
154      * @param extractionResult the output extraction result.
155      * @return a new instance of a configured NTriples parser.
156      */
157     public RDFParser getNTriplesParser(
158             final boolean verifyDataType,
159             final boolean stopAtFirstError,
160             final ExtractionContext extractionContext,
161             final ExtractionResult extractionResult
162     ) {
163         final RDFParser parser = Rio.createParser(RDFFormat.NTRIPLES);
164         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
165         return parser;
166     }
167 
168     /**
169      * Returns a new instance of a configured NQuadsParser.
170      *
171      * @param verifyDataType data verification enable if <code>true</code>.
172      * @param stopAtFirstError the parser stops at first error if <code>true</code>.
173      * @param extractionContext the extraction context where the parser is used.
174      * @param extractionResult the output extraction result.
175      * @return a new instance of a configured NQuads parser.
176      */
177     public RDFParser getNQuadsParser(
178             final boolean verifyDataType,
179             final boolean stopAtFirstError,
180             final ExtractionContext extractionContext,
181             final ExtractionResult extractionResult
182     ) {
183         final RDFParser parser = Rio.createParser(RDFFormat.NQUADS);
184         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
185         return parser;
186     }
187 
188     /**
189      * Returns a new instance of a configured ManchesterSyntaxParser.
190      *
191      * @param verifyDataType data verification enable if <code>true</code>.
192      * @param stopAtFirstError the parser stops at first error if <code>true</code>.
193      * @param extractionContext the extraction context where the parser is used.
194      * @param extractionResult the output extraction result.
195      * @return a new instance of a configured Manchester Syntax parser.
196      */
197     public RDFParser getManchesterSyntaxParser(
198             final boolean verifyDataType,
199             final boolean stopAtFirstError,
200             final ExtractionContext extractionContext,
201             final ExtractionResult extractionResult
202     ) {
203         final RDFParser parser = Rio.createParser(OWLAPIRDFFormat.MANCHESTER_OWL);
204         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
205         return parser;
206     }
207 
208     /**
209      * Returns a new instance of a configured FunctionalSyntaxParser.
210      *
211      * @param verifyDataType data verification enable if <code>true</code>.
212      * @param stopAtFirstError the parser stops at first error if <code>true</code>.
213      * @param extractionContext the extraction context where the parser is used.
214      * @param extractionResult the output extraction result.
215      * @return a new instance of a configured Functional Syntax parser.
216      */
217     public RDFParser getFunctionalSyntaxParser(
218             final boolean verifyDataType,
219             final boolean stopAtFirstError,
220             final ExtractionContext extractionContext,
221             final ExtractionResult extractionResult
222     ) {
223         final RDFParser parser = Rio.createParser(OWLAPIRDFFormat.OWL_FUNCTIONAL);
224         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
225         return parser;
226     }
227 
228     /**
229      * Returns a new instance of a configured TriXParser.
230      *
231      * @param verifyDataType data verification enable if <code>true</code>.
232      * @param stopAtFirstError the parser stops at first error if <code>true</code>.
233      * @param extractionContext the extraction context where the parser is used.
234      * @param extractionResult the output extraction result.
235      * @return a new instance of a configured TriX parser.
236      */
237     public RDFParser getTriXParser(
238             final boolean verifyDataType,
239             final boolean stopAtFirstError,
240             final ExtractionContext extractionContext,
241             final ExtractionResult extractionResult
242     ) {
243         final RDFParser parser = Rio.createParser(RDFFormat.TRIX);
244         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
245         return parser;
246     }
247     
248     /**
249      * Returns a new instance of a configured <i>SesameJSONLDParser</i>.
250      * @param verifyDataType data verification enable if <code>true</code>.
251      * @param stopAtFirstError the parser stops at first error if <code>true</code>.
252      * @param extractionContext the extraction context where the parser is used.
253      * @param extractionResult the output extraction result.
254      * @return a new instance of a configured JSONLDParser parser.
255      */
256     public RDFParser getJSONLDParser(
257             final boolean verifyDataType,
258             final boolean stopAtFirstError,
259             final ExtractionContext extractionContext,
260             final ExtractionResult extractionResult
261     ) {
262         final RDFParser parser = Rio.createParser(RDFFormat.JSONLD);
263         configureParser(parser, verifyDataType, stopAtFirstError, extractionContext, extractionResult);
264         return parser;
265     }
266 
267     /**
268      * Configures the given parser on the specified extraction result
269      * setting the policies for data verification and error handling.
270      *
271      * @param parser the parser to be configured.
272      * @param verifyDataType enables the data verification.
273      * @param stopAtFirstError enables the tolerant error handling.
274      * @param extractionContext the extraction context in which the parser is used.
275      * @param extractionResult the extraction result used to collect the parsed data.
276      */
277     // TODO: what about passing just default language and ErrorReport to configureParser() ?
278     private void configureParser(
279             final RDFParser parser,
280             final boolean verifyDataType,
281             final boolean stopAtFirstError,
282             final ExtractionContext extractionContext,
283             final ExtractionResult extractionResult
284     ) {
285         parser.getParserConfig().setNonFatalErrors(stopAtFirstError ? Collections.emptySet() : new HashSet<>(parser.getSupportedSettings()));
286         parser.set(BasicParserSettings.FAIL_ON_UNKNOWN_DATATYPES, verifyDataType);
287         parser.set(BasicParserSettings.VERIFY_DATATYPE_VALUES, verifyDataType);
288 
289         parser.setParseErrorListener(new InternalParseErrorListener(extractionResult));
290         parser.setValueFactory(
291                 new Any23ValueFactoryWrapper(
292                         SimpleValueFactory.getInstance(),
293                         extractionResult,
294                         extractionContext.getDefaultLanguage()
295                 )
296         );
297         parser.setRDFHandler(new RDFHandlerAdapter(extractionResult));
298     }
299 
300     /**
301      * Internal listener used to trace <i>RDF</i> parse errors.
302      */
303     private class InternalParseErrorListener implements ParseErrorListener {
304 
305         private final IssueReport extractionResult;
306 
307         public InternalParseErrorListener(IssueReport er) {
308             extractionResult = er;
309         }
310 
311         @Override
312         public void warning(String msg, long lineNo, long colNo) {
313             try {
314                 extractionResult.notifyIssue(IssueReport.IssueLevel.WARNING, msg, lineNo, colNo);
315             } catch (Exception e) {
316                 notifyExceptionInNotification(e);
317             }
318         }
319 
320         @Override
321         public void error(String msg, long lineNo, long colNo) {
322             try {
323                 extractionResult.notifyIssue(IssueReport.IssueLevel.ERROR, msg, lineNo, colNo);
324             } catch (Exception e) {
325                 notifyExceptionInNotification(e);
326             }
327         }
328 
329         @Override
330         public void fatalError(String msg, long lineNo, long colNo) {
331             try {
332                 extractionResult.notifyIssue(IssueReport.IssueLevel.FATAL, msg, lineNo, colNo);
333             } catch (Exception e) {
334                 notifyExceptionInNotification(e);
335             }
336         }
337 
338         private void notifyExceptionInNotification(Exception e) {
339             if (logger != null) {
340                 logger.error("An exception occurred while notifying an error.", e);
341             }
342         }
343     }
344 
345     /**
346      * This extended Turtle parser sets the default namespace to the base IRI
347      * before the parsing.
348      */
349     private class ExtendedTurtleParser extends TurtleParser {
350         @Override
351         public void parse(Reader reader, String baseIRI)
352         throws IOException, RDFParseException, RDFHandlerException {
353             setNamespace("", baseIRI);
354             super.parse(reader, baseIRI);
355         }
356 
357         @Override
358         public void parse(InputStream in, String baseIRI)
359         throws IOException, RDFParseException, RDFHandlerException {
360             setNamespace("", baseIRI);
361             super.parse(in, baseIRI);
362         }
363     }
364 }