View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.rdf;
19  
20  import org.apache.any23.util.MathUtils;
21  import org.apache.any23.util.StringUtils;
22  import org.eclipse.rdf4j.model.BNode;
23  import org.eclipse.rdf4j.model.IRI;
24  import org.eclipse.rdf4j.model.Literal;
25  import org.eclipse.rdf4j.model.Resource;
26  import org.eclipse.rdf4j.model.Statement;
27  import org.eclipse.rdf4j.model.Value;
28  import org.eclipse.rdf4j.model.ValueFactory;
29  import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
30  import org.eclipse.rdf4j.model.vocabulary.RDF;
31  import org.eclipse.rdf4j.rio.RDFFormat;
32  import org.eclipse.rdf4j.rio.RDFParser;
33  import org.eclipse.rdf4j.rio.RDFParserRegistry;
34  import org.eclipse.rdf4j.rio.RDFWriter;
35  import org.eclipse.rdf4j.rio.Rio;
36  import org.eclipse.rdf4j.rio.helpers.BasicParserSettings;
37  import org.eclipse.rdf4j.rio.helpers.StatementCollector;
38  import org.slf4j.Logger;
39  import org.slf4j.LoggerFactory;
40  
41  import javax.xml.datatype.DatatypeConfigurationException;
42  import javax.xml.datatype.DatatypeFactory;
43  import javax.xml.datatype.XMLGregorianCalendar;
44  import java.io.ByteArrayInputStream;
45  import java.io.IOException;
46  import java.io.InputStream;
47  import java.io.OutputStream;
48  import java.io.Writer;
49  import java.net.URISyntaxException;
50  import java.text.ParseException;
51  import java.text.SimpleDateFormat;
52  import java.util.Collection;
53  import java.util.Date;
54  import java.util.GregorianCalendar;
55  import java.util.Optional;
56  
57  /**
58   * Basic class providing a set of utility methods when dealing with <i>RDF</i>.
59   *
60   * @author Michele Mostarda (mostarda@fbk.eu)
61   * @author Davide Palmisano (dpalmisano@gmail.com)
62   * @author Jacek Grzebyta (jgrzebyta@apache.org)
63   */
64  public class RDFUtils {
65  
66      private static int nodeId = 0;
67  
68      private static final ValueFactory valueFactory = SimpleValueFactory.getInstance();
69  
70      private static final Logger LOG = LoggerFactory.getLogger(RDFUtils.class);
71  
72      private static final Statement[] EMPTY_STATEMENTS = new Statement[0];
73  
74      private RDFUtils() {}
75  
76      /**
77       * Fixes typical errors in an absolute org.eclipse.rdf4j.model.IRI, such as unescaped spaces.
78       *
79       * @param uri An absolute org.eclipse.rdf4j.model.IRI, can have typical syntax errors
80       * @return An absolute org.eclipse.rdf4j.model.IRI that is valid against the org.eclipse.rdf4j.model.IRI syntax
81       * @throws IllegalArgumentException if org.eclipse.rdf4j.model.IRI is not fixable
82       */
83      public static String fixAbsoluteIRI(String uri) {
84          String fixed = fixIRIWithException(uri);
85          if (!fixed.matches("[a-zA-Z0-9]+:/.*"))
86              throw new IllegalArgumentException("not a absolute org.eclipse.rdf4j.model.IRI: " + uri);
87          // Add trailing slash if org.eclipse.rdf4j.model.IRI has only authority but no path.
88          if (fixed.matches("https?://[a-zA-Z0-9.-]+(:[0-9+])?")) {
89              fixed = fixed + "/";
90          }
91          return fixed;
92      }
93  
94      /**
95       * This method allows to obtain an <a href="http://www.w3.org/TR/xmlschema-2/#date">XML Schema</a> compliant date
96       * providing a textual representation of a date and textual a pattern for parsing it.
97       *
98       * @param dateToBeParsed the String containing the date.
99       * @param format the pattern as descibed in {@link java.text.SimpleDateFormat}
100      * @return a {@link String} representing the date
101      * @throws java.text.ParseException if there is an error parsing the given date.
102      * @throws javax.xml.datatype.DatatypeConfigurationException if there is a serious
103      * configuration error.
104      */
105     public static String getXSDDate(String dateToBeParsed, String format)
106     throws ParseException, DatatypeConfigurationException {
107         SimpleDateFormat simpleDateFormat = new SimpleDateFormat(format);
108         Date date = simpleDateFormat.parse(dateToBeParsed);
109         GregorianCalendar gc = new GregorianCalendar();
110         gc.setTime(date);
111         XMLGregorianCalendar xml = DatatypeFactory.newInstance().newXMLGregorianCalendar(gc);
112         xml.setTimezone(0);
113         return xml.toString();
114     }
115 
116     /**
117      * Prints a <code>date</code> to the XSD datetime format.
118      *
119      * @param date date to be printed.
120      * @return the string representation of the input date.
121      */
122     public static String toXSDDateTime(Date date) {
123         SimpleDateFormat simpleDateFormat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssZ");
124         String s = simpleDateFormat.format(date);
125         StringBuilder sb = new StringBuilder(s);
126         sb.insert(22, ':');
127         return sb.toString();
128     }
129 
130     /**
131      * <p>Tries to fix a potentially broken relative or absolute URI.</p>
132      * These appear to be good rules:
133      * Remove whitespace or '\' or '"' in beginning and end
134      * Replace space with %20
135      * Drop the triple if it matches this regex (only protocol): ^[a-zA-Z0-9]+:(//)?$
136      * Drop the triple if it matches this regex: ^javascript:
137      * Truncate "&gt;.*$ from end of lines (Neko didn't quite manage to fix broken markup)
138      * Drop the triple if any of these appear in the URL: &lt;&gt;[]|*{}"&lt;&gt;\
139      *
140      * @param unescapedIRI uri string to be unescaped.
141      * @return the unescaped string.
142      */
143     public static String fixIRIWithException(String unescapedIRI) {
144         if (unescapedIRI == null)
145             throw new IllegalArgumentException("org.eclipse.rdf4j.model.IRI was null");
146 
147         //    Remove starting and ending whitespace
148         String escapedIRI = unescapedIRI.trim();
149 
150         //Replace space with %20
151         escapedIRI = escapedIRI.replaceAll(" ", "%20");
152 
153         //strip linebreaks
154         escapedIRI = escapedIRI.replaceAll("\n", "");
155 
156         //'Remove starting  "\" or '"'
157         if (escapedIRI.startsWith("\\") || escapedIRI.startsWith("\""))
158             escapedIRI = escapedIRI.substring(1);
159         //Remove  ending   "\" or '"'
160         if (escapedIRI.endsWith("\\") || escapedIRI.endsWith("\""))
161             escapedIRI = escapedIRI.substring(0, escapedIRI.length() - 1);
162 
163         //Drop the triple if it matches this regex (only protocol): ^[a-zA-Z0-9]+:/?/?$
164         if (escapedIRI.matches("^[a-zA-Z0-9]+:/?/?$"))
165             throw new IllegalArgumentException("no authority in org.eclipse.rdf4j.model.IRI: " + unescapedIRI);
166 
167         //Drop the triple if it matches this regex: ^javascript:
168         if (escapedIRI.matches("^javascript:"))
169             throw new IllegalArgumentException("org.eclipse.rdf4j.model.IRI starts with javascript: " + unescapedIRI);
170 
171         // stripHTML
172         // escapedIRI = escapedIRI.replaceAll("\\<.*?\\>", "");
173 
174         //>.*$ from end of lines (Neko didn't quite manage to fix broken markup)
175         escapedIRI = escapedIRI.replaceAll(">.*$", "");
176 
177         //Drop the triple if any of these appear in the URL: <>[]|*{}"<>\
178         if (escapedIRI.matches("[<>\\[\\]|\\*\\{\\}\"\\\\]"))
179             throw new IllegalArgumentException("Invalid character in org.eclipse.rdf4j.model.IRI: " + unescapedIRI);
180 
181         return escapedIRI;
182     }
183 
184     /**
185      * Creates a {@link org.eclipse.rdf4j.model.IRI}.
186      * @param iri a base string for the {@link org.eclipse.rdf4j.model.IRI}
187      * @return a valid {@link org.eclipse.rdf4j.model.IRI}
188      */
189     public static org.eclipse.rdf4j.model.IRI iri(String iri) {
190         return valueFactory.createIRI(iri);
191     }
192 
193     /**
194      * Creates a {@link org.eclipse.rdf4j.model.IRI}.
195      * @param namespace a base namespace for the {@link org.eclipse.rdf4j.model.IRI}
196      * @param localName a local name to associate with the namespace
197      * @return a valid {@link org.eclipse.rdf4j.model.IRI}
198      */
199     public static org.eclipse.rdf4j.model.IRI iri(String namespace, String localName) {
200         return valueFactory.createIRI(namespace, localName);
201     }
202 
203     /**
204      * Creates a {@link Literal}.
205      * @param s string representation of the {@link org.eclipse.rdf4j.model.Literal}
206      * @return valid {@link org.eclipse.rdf4j.model.Literal}
207      */
208     public static Literal literal(String s) {
209         return valueFactory.createLiteral(s);
210     }
211 
212     /**
213      * Creates a {@link Literal}.
214      * @param b boolean representation of the {@link org.eclipse.rdf4j.model.Literal}
215      * @return valid {@link org.eclipse.rdf4j.model.Literal}
216      */
217     public static Literal literal(boolean b) {
218         return valueFactory.createLiteral(b);
219     }
220 
221     /**
222      * Creates a {@link Literal}.
223      * @param b byte representation of the {@link org.eclipse.rdf4j.model.Literal}
224      * @return valid {@link org.eclipse.rdf4j.model.Literal}
225      */
226     public static Literal literal(byte b) {
227         return valueFactory.createLiteral(b);
228     }
229 
230     /**
231      * Creates a {@link Literal}.
232      * @param s short representation of the {@link org.eclipse.rdf4j.model.Literal}
233      * @return valid {@link org.eclipse.rdf4j.model.Literal}
234      */
235     public static Literal literal(short s) {
236         return valueFactory.createLiteral(s);
237     }
238 
239     /**
240      * Creates a {@link Literal}.
241      * @param i int representation of the {@link org.eclipse.rdf4j.model.Literal}
242      * @return valid {@link org.eclipse.rdf4j.model.Literal}
243      */
244     public static Literal literal(int i) {
245         return valueFactory.createLiteral(i);
246     }
247 
248     /**
249      * Creates a {@link Literal}.
250      * @param l long representation of the {@link org.eclipse.rdf4j.model.Literal}
251      * @return valid {@link org.eclipse.rdf4j.model.Literal}
252      */
253     public static Literal literal(long l) {
254         return valueFactory.createLiteral(l);
255     }
256 
257     /**
258      * Creates a {@link Literal}.
259      * @param f float representation of the {@link org.eclipse.rdf4j.model.Literal}
260      * @return valid {@link org.eclipse.rdf4j.model.Literal}
261      */
262     public static Literal literal(float f) {
263         return valueFactory.createLiteral(f);
264     }
265 
266     /**
267      * Creates a {@link Literal}.
268      * @param d double representation of the {@link org.eclipse.rdf4j.model.Literal}
269      * @return valid {@link org.eclipse.rdf4j.model.Literal}
270      */
271     public static Literal literal(double d) {
272         return valueFactory.createLiteral(d);
273     }
274 
275     /**
276      * Creates a {@link Literal}.
277      * @param s the literal's label
278      * @param l the literal's language
279      * @return valid {@link org.eclipse.rdf4j.model.Literal}
280      */
281     public static Literal literal(String s, String l) {
282         if(l == null) {
283             // HACK: Workaround for ANY23 code that passes null in for language tag
284             return valueFactory.createLiteral(s);
285         } else {
286             return valueFactory.createLiteral(s, l);
287         }
288     }
289 
290     /**
291      * Creates a {@link Literal}.
292      * @param s the literal's label
293      * @param datatype the literal's datatype
294      * @return valid {@link org.eclipse.rdf4j.model.Literal}
295      */
296     public static Literal literal(String s, org.eclipse.rdf4j.model.IRI datatype) {
297         return valueFactory.createLiteral(s, datatype);
298     }
299 
300     /**
301      * Creates a {@link BNode}.
302      * @param id string representation of the {@link org.eclipse.rdf4j.model.BNode}
303      * @return the valid {@link org.eclipse.rdf4j.model.BNode}
304      */
305     // TODO: replace this with all occurrences of #getBNode()
306     public static BNode bnode(String id) {
307         return valueFactory.createBNode(id);
308     }
309 
310     /**
311      * @return a <code>bnode</code> with unique id.
312      */
313     public static BNode bnode() {
314         return valueFactory.createBNode();
315     }
316 
317     /**
318      * Creates a {@link BNode}.
319      * @param id string representation of the {@link org.eclipse.rdf4j.model.BNode}
320      * name for which we will create a md5 hash.
321      * @return the valid {@link org.eclipse.rdf4j.model.BNode} 
322      */
323     public static BNode getBNode(String id) {
324         return valueFactory.createBNode(
325             "node" + MathUtils.md5(id)
326         );
327     }
328 
329     /**
330      * Creates a {@link Statement}.
331      * @param s subject {@link org.eclipse.rdf4j.model.Resource}
332      * @param p predicate {@link org.eclipse.rdf4j.model.URI}
333      * @param o object {@link org.eclipse.rdf4j.model.Value}
334      * @return valid {@link org.eclipse.rdf4j.model.Statement}
335      */
336     public static Statement triple(Resource s, org.eclipse.rdf4j.model.IRI p, Value o) {
337         return valueFactory.createStatement(s, p, o);
338     }
339 
340     /**
341      * Creates a statement of type: <code>toValue(s), toValue(p), toValue(o)</code>
342      *
343      * @param s subject.
344      * @param p predicate.
345      * @param o object.
346      * @return a statement instance.
347      */
348     public static Statement triple(String s, String p, String o) {
349         return valueFactory.createStatement((Resource) toValue(s), (org.eclipse.rdf4j.model.IRI) toValue(p), toValue(o));
350     }
351 
352     /**
353      * Creates a {@link Statement}.
354      * @param s subject.
355      * @param p predicate.
356      * @param o object.
357      * @param g quad resource
358      * @return a statement instance.
359      */
360     public static Statement quad(Resource s, org.eclipse.rdf4j.model.IRI p, Value o, Resource g) {
361         return valueFactory.createStatement(s, p, o, g);
362     }
363 
364     /**
365      * Creates a statement of type: <code>toValue(s), toValue(p), toValue(o), toValue(g)</code>
366      * @param s subject.
367      * @param p predicate.
368      * @param o object.
369      * @param g quad resource
370      * @return a statement instance.
371      */
372     public static Statement quad(String s, String p, String o, String g) {
373         return valueFactory.createStatement((Resource) toValue(s), (org.eclipse.rdf4j.model.IRI) toValue(p), toValue(o), (Resource) toValue(g));
374     }
375 
376     /**
377      * Creates a {@link Value}. If <code>s == 'a'</code> returns
378      * an {@link RDF#TYPE}. If <code> s.matches('[a-z0-9]+:.*')</code>
379      * expands the corresponding prefix using {@link PopularPrefixes}.
380      *
381      * @param s string representation of value.
382      * @return a value instance.
383      */
384     public static Value toValue(String s) {
385         if ("a".equals(s))
386             return RDF.TYPE;
387         if (s.matches("[a-z0-9]+:.*")) {
388             return PopularPrefixes.get().expand(s);
389         }
390         return valueFactory.createLiteral(s);
391     }
392 
393     /**
394      *
395      * Returns all the available {@link RDFFormat}s.
396      *
397      * @return an unmodifiable collection of formats.
398      */
399     public static Collection<RDFFormat> getFormats() {
400         return RDFParserRegistry.getInstance().getKeys();
401     }
402 
403     /**
404      * Creates a new {@link RDFParser} instance.
405      *
406      * @param format parser format.
407      * @return parser instance.
408      * @throws IllegalArgumentException if format is not supported.
409      */
410     public static RDFParser getParser(RDFFormat format) {
411         return Rio.createParser(format);
412     }
413 
414     /**
415      * Creates a new {@link RDFWriter} instance.
416      *
417      * @param format output format.
418      * @param writer data output writer.
419      * @return writer instance.
420      * @throws IllegalArgumentException if format is not supported.
421      */
422     public static RDFWriter getWriter(RDFFormat format, Writer writer) {
423         return Rio.createWriter(format, writer);
424     }
425 
426     /**
427      * Creates a new {@link RDFWriter} instance.
428      *
429      * @param format output format.
430      * @param os output stream.
431      * @return writer instance.
432      * @throws IllegalArgumentException if format is not supported.
433      */
434     public static RDFWriter getWriter(RDFFormat format, OutputStream os) {
435         return Rio.createWriter(format, os);
436     }
437 
438     /**
439      * Returns a parser type from the given extension.
440      *
441      * @param ext input extension.
442      * @return parser matching the extension.
443      * @throws IllegalArgumentException if no extension matches.
444      */
445     public static Optional<RDFFormat> getFormatByExtension(String ext) {
446         if (!ext.startsWith("."))
447             ext = "." + ext;
448         return Rio.getParserFormatForFileName(ext);
449     }
450 
451     /**
452      * Parses the content of <code>is</code> input stream with the
453      * specified parser <code>p</code> using <code>baseIRI</code>.
454      *
455      * @param format input format type.
456      * @param is input stream containing <code>RDF</code>.
457      * @param baseIRI base uri.
458      * @return list of statements detected within the input stream.
459      * @throws IOException if there is an error reading the {@link java.io.InputStream}
460      */
461     public static Statement[] parseRDF(RDFFormat format, InputStream is, String baseIRI)
462     throws IOException {
463         final StatementCollector handler = new StatementCollector();
464         final RDFParser parser = getParser(format);
465         parser.getParserConfig().set(BasicParserSettings.VERIFY_DATATYPE_VALUES, true);
466         parser.setPreserveBNodeIDs(true);
467         parser.setRDFHandler(handler);
468         parser.parse(is, baseIRI);
469         return handler.getStatements().toArray(EMPTY_STATEMENTS);
470     }
471 
472     /**
473      * Parses the content of <code>is</code> input stream with the
474      * specified parser <code>p</code> using <code>''</code> as base org.eclipse.rdf4j.model.IRI.
475      *
476      * @param format input format type.
477      * @param is input stream containing <code>RDF</code>.
478      * @return list of statements detected within the input stream.
479      * @throws IOException if there is an error reading the {@link java.io.InputStream}
480      */
481     public static Statement[] parseRDF(RDFFormat format, InputStream is)
482     throws IOException {
483         return parseRDF(format, is, "");
484     }
485 
486     /**
487      * Parses the content of <code>in</code> string with the
488      * specified parser <code>p</code> using <code>''</code> as base org.eclipse.rdf4j.model.IRI.
489      *
490      * @param format input format type.
491      * @param in input string containing <code>RDF</code>.
492      * @return list of statements detected within the input string.
493      * @throws IOException if there is an error reading the {@link java.io.InputStream}
494      */
495     public static Statement[] parseRDF(RDFFormat format, String in)
496     throws IOException {
497         return parseRDF(format, new ByteArrayInputStream(in.getBytes()));
498     }
499 
500     /**
501      * Parses the content of the <code>resource</code> file
502      * guessing the content format from the extension.
503      *
504      * @param resource resource name.
505      * @return the statements declared within the resource file.
506      * @throws java.io.IOException if an error occurs while reading file.
507      */
508     public static Statement[] parseRDF(String resource) throws IOException {
509         final int extIndex = resource.lastIndexOf('.');
510         if (extIndex == -1)
511             throw new IllegalArgumentException("Error while detecting the extension in resource name " + resource);
512         final String extension = resource.substring(extIndex + 1);
513         return parseRDF(getFormatByExtension(extension).orElseThrow(Rio.unsupportedFormat(extension)),
514                 RDFUtils.class.getResourceAsStream(resource));
515     }
516 
517     /**
518      * Checks if <code>href</code> is absolute or not.
519      *
520      * @param href candidate org.eclipse.rdf4j.model.IRI.
521      * @return <code>true</code> if <code>href</code> is absolute,
522      *         <code>false</code> otherwise.
523      */
524     public static boolean isAbsoluteIRI(String href) {
525         try {
526             SimpleValueFactory.getInstance().createIRI(href.trim());
527             new java.net.URI(href.trim());
528             return true;
529         } catch (IllegalArgumentException e) {
530             LOG.trace("Error processing href: {}", href, e);
531             return false;
532         } catch (URISyntaxException e) {
533             LOG.trace("Error interpreting href: {} as URI.", href, e);
534             return false;
535         }
536     }
537 
538     /**
539      * {@link #makeIRI(java.lang.String, org.eclipse.rdf4j.model.IRI, boolean) }.
540      * @param docUri
541      * @return instance of {@link Resource}.
542      */
543     public static Resource makeIRI(IRI docUri) {
544         return makeIRI("node", docUri);
545     }
546 
547     /**
548      * {@link #makeIRI(java.lang.String, org.eclipse.rdf4j.model.IRI, boolean) }.
549      * @param type
550      * @param docIRI
551      * @return instance of {@link Resource}.
552      */
553     public static Resource makeIRI(String type, IRI docIRI) {
554         return makeIRI(type, docIRI, false);
555     }
556 
557     /**
558      * Creates implementation of {@link Resource} from given arguments: <i>type</i> and <i>docIRI</i>.
559      * 
560      * <b>NB:</b> The Java Naming Conventions is described by <a href='http://www.geeksforgeeks.org/java-naming-conventions/'>GeeksForGeeks</a>.
561      * 
562      * @param type This argument is converted following Java naming conventions with {@link StringUtils#implementJavaNaming(java.lang.String) }.
563      * @param docIRI It is a namespace. If it ends with '/' character than stays unchanged otherwise the hash character '#' is added to the end.
564      * @param addId If argument is <b>TRUE</b> than the node identifier is added to the end formated <tt>'_{int}'</tt>.
565      * @return instance of {@link Resource}.
566      */
567     public static Resource makeIRI(String type, IRI docIRI, boolean addId) {
568 
569         // preprocess string: converts - -> _
570         //                    converts <space>: word1 word2 -> word1Word2
571         String newType = StringUtils.implementJavaNaming(type);
572 
573         String iriString;
574         if (docIRI.toString().endsWith("/") || docIRI.toString().endsWith("#")) {
575             iriString = docIRI.toString() + newType;
576         } else {
577             iriString = docIRI.toString() + "#" + newType;
578         }
579 
580         if (addId) {
581             iriString = iriString + "_" + Integer.toString(nodeId);
582         }
583 
584         Resource node = RDFUtils.iri(iriString);
585         if (addId) {
586             nodeId++;
587         }
588         return node;
589     }
590     
591     /**
592      * Convert string to either IRI or Literal.
593      * 
594      * If string value expresses valid IRI than {@link IRI} is created. Otherwise method 
595      * creates simple {@link Literal} xsd:string.
596      * 
597      * @param inString
598      * @return either {@link IRI} or {@link Literal}.
599      */
600     public static Value makeIRI(String inString) {
601         if (RDFUtils.isAbsoluteIRI(inString)) {
602             return RDFUtils.iri(inString);
603         } else {
604             return RDFUtils.literal(inString);
605         }
606     }
607     
608     public static Value makeIRI() {
609         BNode bnode = bnode(Integer.toString(nodeId));
610         nodeId++;
611         return bnode;
612     }
613 
614 }