View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.rdfa;
19  
20  import java.io.IOException;
21  import java.net.MalformedURLException;
22  import java.net.URISyntaxException;
23  import java.net.URL;
24  import java.util.ArrayList;
25  import java.util.HashMap;
26  import java.util.List;
27  import java.util.Locale;
28  import java.util.Map;
29  import java.util.Stack;
30  import javax.xml.transform.TransformerException;
31  import org.apache.any23.extractor.ExtractionResult;
32  import org.apache.any23.extractor.IssueReport;
33  import org.apache.any23.extractor.html.DomUtils;
34  import org.apache.any23.rdf.RDFUtils;
35  import org.eclipse.rdf4j.model.IRI;
36  import org.eclipse.rdf4j.model.Literal;
37  import org.eclipse.rdf4j.model.Resource;
38  import org.eclipse.rdf4j.model.Value;
39  import org.eclipse.rdf4j.model.vocabulary.RDF;
40  import org.slf4j.Logger;
41  import org.slf4j.LoggerFactory;
42  import org.w3c.dom.Document;
43  import org.w3c.dom.NamedNodeMap;
44  import org.w3c.dom.Node;
45  import org.w3c.dom.NodeList;
46  
47  /**
48   * This parser is able to extract <a href="http://www.w3.org/TR/rdfa-syntax/">RDFa 1.0</a> and
49   * <a href="http://www.w3.org/TR/rdfa-core/">RDFa 1.1</a> statements from any <i>(X)HTML</i> document.
50   * 
51   * @deprecated since 2.3 the {@link org.eclipse.rdf4j.rio.Rio} implementations are used to parse RDFa. Look at
52   *             {@link org.apache.any23.extractor.rdf.RDFParserFactory#getRDFa10Parser} and
53   *             {@link org.apache.any23.extractor.rdf.RDFParserFactory#getRDFa11Parser}.
54   * 
55   * @author Michele Mostarda (mostarda@fbk.eu)
56   */
57  @Deprecated
58  public class RDFa11Parser {
59  
60      private static final Logger logger = LoggerFactory.getLogger(RDFa11Parser.class);
61  
62      public static final String CURIE_SEPARATOR = ":";
63      public static final char IRI_PREFIX_SEPARATOR = ':';
64      public static final String IRI_SCHEMA_SEPARATOR = "://";
65      public static final String IRI_PATH_SEPARATOR = "/";
66  
67      public static final String HEAD_TAG = "HEAD";
68      public static final String BODY_TAG = "BODY";
69  
70      public static final String XMLNS_ATTRIBUTE = "xmlns";
71      public static final String XML_LANG_ATTRIBUTE = "xml:lang";
72  
73      public static final String REL_ATTRIBUTE = "rel";
74      public static final String REV_ATTRIBUTE = "rev";
75  
76      public static final String ABOUT_ATTRIBUTE = "about";
77      public static final String RESOURCE_ATTRIBUTE = "resource";
78      public static final String SRC_ATTRIBUTE = "src";
79      public static final String HREF_ATTRIBUTE = "href";
80  
81      public static final String TYPE_ATTRIBUTE = "type";
82      public static final String ATTRIBUTE_CSS = "text/css";
83  
84      public static final String[] SUBJECT_ATTRIBUTES = { ABOUT_ATTRIBUTE, SRC_ATTRIBUTE, RESOURCE_ATTRIBUTE,
85              HREF_ATTRIBUTE };
86  
87      public static final String PREFIX_ATTRIBUTE = "prefix";
88      public static final String TYPEOF_ATTRIBUTE = "typeof";
89      public static final String PROPERTY_ATTRIBUTE = "property";
90      public static final String DATATYPE_ATTRIBUTE = "datatype";
91      public static final String CONTENT_ATTRIBUTE = "content";
92      public static final String VOCAB_ATTRIBUTE = "vocab";
93      // TODO: introduce support for RDFa profiles. (http://www.w3.org/TR/rdfa-core/#s_profiles)
94      public static final String PROFILE_ATTRIBUTE = "profile";
95  
96      public static final String XML_LITERAL_DATATYPE = "rdf:XMLLiteral";
97  
98      public static final String XMLNS_DEFAULT = "http://www.w3.org/1999/xhtml";
99  
100     private IssueReport issueReport;
101 
102     private URL documentBase;
103 
104     private final Stack<IRIMapping> IRIMappingStack = new Stack<>();
105 
106     private final Stack<Vocabulary> vocabularyStack = new Stack<>();
107 
108     private final List<IncompleteTriple> listOfIncompleteTriples = new ArrayList<>();
109 
110     private final Stack<EvaluationContext> evaluationContextStack = new Stack<>();
111 
112     public RDFa11Parser() {
113         // default constructor
114     }
115 
116     protected static URL getDocumentBase(URL documentURL, Document document) throws MalformedURLException {
117         String base;
118         base = DomUtils.find(document, "/HTML/HEAD/BASE/@href"); // Non XHTML documents.
119         if (!"".equals(base))
120             return new URL(base);
121         base = DomUtils.find(document, "//*/h:head/h:base[position()=1]/@href"); // XHTML documents.
122         if (!"".equals(base))
123             return new URL(base);
124         return documentURL;
125     }
126 
127     /**
128      * Given a prefix declaration returns a list of <code>prefixID:prefixURL</code> strings normalizing blanks where
129      * present.
130      *
131      * @param prefixesDeclaration
132      *            input prefix
133      * 
134      * @return list of extracted prefixes.
135      */
136     protected static String[] extractPrefixSections(String prefixesDeclaration) {
137         final String[] parts = prefixesDeclaration.split("\\s");
138         final List<String> out = new ArrayList<>();
139         int i = 0;
140         while (i < parts.length) {
141             final String part = parts[i];
142             if (part.length() == 0) {
143                 i++;
144                 continue;
145             }
146             if (part.charAt(part.length() - 1) == IRI_PREFIX_SEPARATOR) {
147                 i++;
148                 while (i < parts.length && parts[i].length() == 0)
149                     i++;
150                 out.add(part + (i < parts.length ? parts[i] : ""));
151                 i++;
152             } else {
153                 out.add(parts[i]);
154                 i++;
155             }
156         }
157         return out.toArray(new String[out.size()]);
158     }
159 
160     protected static boolean isAbsoluteIRI(String iri) {
161         return iri.contains(IRI_SCHEMA_SEPARATOR);
162     }
163 
164     protected static boolean isCURIE(String curie) {
165         if (curie == null) {
166             throw new NullPointerException("curie string cannot be null.");
167         }
168         if (curie.trim().length() == 0)
169             return false;
170 
171         // '[' PREFIX ':' VALUE ']'
172         if (curie.charAt(0) != '[' || curie.charAt(curie.length() - 1) != ']')
173             return false;
174         int separatorIndex = curie.indexOf(CURIE_SEPARATOR);
175         return separatorIndex > 0 && curie.indexOf(CURIE_SEPARATOR, separatorIndex + 1) == -1;
176     }
177 
178     protected static boolean isCURIEBNode(String curie) {
179         return isCURIE(curie) && curie.substring(1, curie.length() - 1).split(CURIE_SEPARATOR)[0].equals("_");
180     }
181 
182     protected static boolean isRelativeNode(Node node) {
183         if (ATTRIBUTE_CSS.equals(DomUtils.readAttribute(node, TYPE_ATTRIBUTE)))
184             return false;
185         return DomUtils.hasAttribute(node, REL_ATTRIBUTE) || DomUtils.hasAttribute(node, REV_ATTRIBUTE);
186     }
187 
188     // RDFa1.0[5.5.9.2]
189     protected static Literal getAsPlainLiteral(Node node, String currentLanguage) {
190         final String content = DomUtils.readAttribute(node, CONTENT_ATTRIBUTE, null);
191         if (content != null)
192             return RDFUtils.literal(content, currentLanguage);
193 
194         if (!node.hasChildNodes())
195             return RDFUtils.literal("", currentLanguage);
196 
197         final String nodeTextContent = node.getTextContent();
198         return nodeTextContent == null ? null : RDFUtils.literal(nodeTextContent.trim(), currentLanguage);
199     }
200 
201     protected static Literal getAsXMLLiteral(Node node) throws IOException, TransformerException {
202         final String datatype = DomUtils.readAttribute(node, DATATYPE_ATTRIBUTE, null);
203         if (!XML_LITERAL_DATATYPE.equals(datatype))
204             return null;
205 
206         final String xmlSerializedNode = DomUtils.serializeToXML(node, false);
207         return RDFUtils.literal(xmlSerializedNode, RDF.XMLLITERAL);
208     }
209 
210     protected static boolean isXMLNSDeclared(Document document) {
211         final String attributeValue = document.getDocumentElement().getAttribute(XMLNS_ATTRIBUTE);
212         if (attributeValue.length() == 0)
213             return false;
214         return XMLNS_DEFAULT.equals(attributeValue);
215     }
216 
217     /**
218      * <a href="http://www.w3.org/TR/rdfa-syntax/#s_model">RDFa Syntax - Processing Model</a>.
219      *
220      * @param documentURL
221      *            {@link java.net.URL} of the document to process
222      * @param extractionResult
223      *            a {@link org.apache.any23.extractor.ExtractionResult} to populate
224      * @param document
225      *            the {@link org.w3c.dom.Document} to populate with parse content
226      * 
227      * @throws RDFa11ParserException
228      *             if there is an error parsing the document
229      */
230     public void processDocument(URL documentURL, Document document, ExtractionResult extractionResult)
231             throws RDFa11ParserException {
232         try {
233             this.issueReport = extractionResult;
234 
235             // Check RDFa1.0[4.1.3] : default XMLNS declaration.
236             if (!isXMLNSDeclared(document)) {
237                 reportError(document.getDocumentElement(),
238                         String.format(Locale.ROOT,
239                                 "The default %s namespace is expected to be declared and equal to '%s' .",
240                                 XMLNS_ATTRIBUTE, XMLNS_DEFAULT));
241             }
242 
243             try {
244                 documentBase = getDocumentBase(documentURL, document);
245             } catch (MalformedURLException murle) {
246                 throw new RDFa11ParserException("Invalid document base URL.", murle);
247             }
248 
249             // RDFa1.0[5.5.1]
250             pushContext(document, new EvaluationContext(documentBase));
251 
252             depthFirstNode(document, extractionResult);
253 
254             assert listOfIncompleteTriples
255                     .isEmpty() : "The list of incomplete triples is expected to be empty at the end of processing.";
256         } finally {
257             reset();
258         }
259     }
260 
261     /**
262      * Resets the parser to the original state.
263      */
264     public void reset() {
265         issueReport = null;
266         documentBase = null;
267         IRIMappingStack.clear();
268         listOfIncompleteTriples.clear();
269         evaluationContextStack.clear();
270     }
271 
272     /**
273      * Updates the vocabulary context with possible <em>@vocab</em> declarations.
274      *
275      * @param currentNode
276      *            the current node.
277      */
278     protected void updateVocabulary(Node currentNode) {
279         final String vocabularyStr = DomUtils.readAttribute(currentNode, VOCAB_ATTRIBUTE, null);
280         if (vocabularyStr == null)
281             return;
282         try {
283             pushVocabulary(currentNode, RDFUtils.iri(vocabularyStr));
284         } catch (Exception e) {
285             reportError(currentNode,
286                     String.format(Locale.ROOT, "Invalid vocabulary [%s], must be a IRI.", vocabularyStr));
287         }
288     }
289 
290     /**
291      * Updates the IRI mapping with the XMLNS attributes declared in the current node.
292      *
293      * @param node
294      *            input node.
295      */
296     protected void updateIRIMapping(Node node) {
297         final NamedNodeMap attributes = node.getAttributes();
298         if (null == attributes)
299             return;
300 
301         Node attribute;
302         final List<PrefixMap> prefixMapList = new ArrayList<PrefixMap>();
303         final String namespacePrefix = XMLNS_ATTRIBUTE + IRI_PREFIX_SEPARATOR;
304         for (int a = 0; a < attributes.getLength(); a++) {
305             attribute = attributes.item(a);
306             if (attribute.getNodeName().startsWith(namespacePrefix)) {
307                 prefixMapList.add(new PrefixMap(attribute.getNodeName().substring(namespacePrefix.length()),
308                         resolveIRI(attribute.getNodeValue())));
309             }
310         }
311 
312         extractPrefixes(node, prefixMapList);
313 
314         if (prefixMapList.size() == 0)
315             return;
316         pushMappings(node, prefixMapList);
317     }
318 
319     /**
320      * Returns a IRI mapping for a given prefix.
321      *
322      * @param prefix
323      *            input prefix.
324      * 
325      * @return IRI mapping.
326      */
327     protected IRI getMapping(String prefix) {
328         for (IRIMapping IRIMapping : IRIMappingStack) {
329             final IRI mapping = IRIMapping.map.get(prefix);
330             if (mapping != null) {
331                 return mapping;
332             }
333         }
334         return null;
335     }
336 
337     /**
338      * Resolves a <em>whitelist</em> separated list of <i>CURIE</i> or <i>URI</i>.
339      *
340      * @param n
341      *            current node.
342      * @param curieOrIRIList
343      *            list of CURIE/URI.
344      * @param termAllowed
345      *            determine whether the term should be whitelisted.
346      * 
347      * @return list of resolved URIs.
348      * 
349      * @throws URISyntaxException
350      *             if there is an error processing CURIE or URL
351      */
352     protected IRI[] resolveCIRIeOrIRIList(Node n, String curieOrIRIList, boolean termAllowed)
353             throws URISyntaxException {
354         if (curieOrIRIList == null || curieOrIRIList.trim().length() == 0)
355             return new IRI[0];
356 
357         final String[] curieOrIRIListParts = curieOrIRIList.split("\\s");
358         final List<IRI> result = new ArrayList<>();
359         Resource curieOrIRI;
360         for (String curieORIRIListPart : curieOrIRIListParts) {
361             curieOrIRI = resolveCURIEOrIRI(curieORIRIListPart, termAllowed);
362             if (curieOrIRI != null && curieOrIRI instanceof IRI) {
363                 result.add((IRI) curieOrIRI);
364             } else {
365                 reportError(n, String.format(Locale.ROOT, "Invalid CURIE '%s' : expected IRI, found BNode.",
366                         curieORIRIListPart));
367             }
368         }
369         return result.toArray(new IRI[result.size()]);
370     }
371 
372     /**
373      * Resolves a IRI string as IRI.
374      *
375      * @param iriStr
376      *            (partial) IRI string to be resolved.
377      * 
378      * @return the resolved IRI.
379      */
380     protected IRI resolveIRI(String iriStr) {
381         return isAbsoluteIRI(iriStr) ? RDFUtils.iri(iriStr) : RDFUtils.iri(this.documentBase.toExternalForm(), iriStr);
382     }
383 
384     /**
385      * Resolves a <i>CURIE</i> or <i>IRI</i> string.
386      *
387      * @param curieOrIRI
388      *            individual of CURIE/URI to resolve
389      * @param termAllowed
390      *            if <code>true</code> the resolution can be a term.
391      * 
392      * @return the resolved resource.
393      */
394     protected Resource resolveCURIEOrIRI(String curieOrIRI, boolean termAllowed) {
395         if (isCURIE(curieOrIRI)) {
396             return resolveNamespacedIRI(curieOrIRI.substring(1, curieOrIRI.length() - 1), ResolutionPolicy.NSRequired);
397         }
398         if (isAbsoluteIRI(curieOrIRI))
399             return resolveIRI(curieOrIRI);
400         return resolveNamespacedIRI(curieOrIRI,
401                 termAllowed ? ResolutionPolicy.TermAllowed : ResolutionPolicy.NSNotRequired);
402     }
403 
404     /**
405      * Pushes a context whiting the evaluation context stack, associated to tha given generation node.
406      *
407      * @param current
408      * @param ec
409      */
410     private void pushContext(Node current, EvaluationContext ec) {
411         ec.node = current;
412         evaluationContextStack.push(ec);
413     }
414 
415     /**
416      * @return the peek evaluation context.
417      */
418     private EvaluationContext getContext() {
419         return evaluationContextStack.peek();
420     }
421 
422     /**
423      * Pops out the peek evaluation context if ancestor of current node.
424      *
425      * @param current
426      *            current node.
427      */
428     private void popContext(Node current) {
429         final Node peekNode = evaluationContextStack.peek().node;
430         if (DomUtils.isAncestorOf(peekNode, current)) {
431             evaluationContextStack.pop();
432         }
433     }
434 
435     /**
436      * Pushes a new vocabulary definition.
437      *
438      * @param currentNode
439      *            node proving the vocabulary.
440      * @param vocab
441      *            the vocabulary IRI.
442      */
443     private void pushVocabulary(Node currentNode, IRI vocab) {
444         vocabularyStack.push(new Vocabulary(currentNode, vocab));
445     }
446 
447     /**
448      * @return the current peek vocabulary.
449      */
450     private IRI getVocabulary() {
451         if (vocabularyStack.isEmpty())
452             return null;
453         return vocabularyStack.peek().prefix;
454     }
455 
456     /**
457      * Pops out the vocabulary definition.
458      *
459      * @param current
460      */
461     private void popVocabulary(Node current) {
462         if (vocabularyStack.isEmpty())
463             return;
464         if (DomUtils.isAncestorOf(current, vocabularyStack.peek().originatingNode)) {
465             vocabularyStack.pop();
466         }
467     }
468 
469     /**
470      * Purge all incomplete triples originated from a node that is descendant of <code>current</code>.
471      *
472      * @param current
473      */
474     private void purgeIncompleteTriples(Node current) {
475         final List<IncompleteTriple> toBePurged = new ArrayList<>();
476         for (IncompleteTriple incompleteTriple : listOfIncompleteTriples) {
477             if (DomUtils.isAncestorOf(current, incompleteTriple.originatingNode, true)) {
478                 toBePurged.add(incompleteTriple);
479             }
480         }
481         listOfIncompleteTriples.removeAll(toBePurged);
482         toBePurged.clear();
483     }
484 
485     /**
486      * Reports an error to the error reporter.
487      *
488      * @param n
489      *            originating node.
490      * @param msg
491      *            human readable message.
492      */
493     private void reportError(Node n, String msg) {
494         final String errorMsg = String.format(Locale.ROOT, "Error while processing node [%s] : '%s'",
495                 DomUtils.getXPathForNode(n), msg);
496         final int[] errorLocation = DomUtils.getNodeLocation(n);
497         this.issueReport.notifyIssue(IssueReport.IssueLevel.WARNING, errorMsg,
498                 errorLocation == null ? -1 : errorLocation[0], errorLocation == null ? -1 : errorLocation[1]);
499     }
500 
501     /**
502      * Performs a <i>deep-first</i> tree visit on the given root node.
503      *
504      * @param node
505      *            root node.
506      * @param extractionResult
507      */
508     private void depthFirstNode(Node node, ExtractionResult extractionResult) {
509         try {
510             processNode(node, extractionResult);
511         } catch (Exception e) {
512             if (logger.isDebugEnabled())
513                 logger.debug("Error while processing node.", e);
514             reportError(node, e.getMessage());
515         }
516         depthFirstChildren(node.getChildNodes(), extractionResult);
517         purgeIncompleteTriples(node);
518     }
519 
520     /**
521      * Performs a <i>deep-first</i> children list visit.
522      *
523      * @param nodeList
524      * @param extractionResult
525      */
526     private void depthFirstChildren(NodeList nodeList, ExtractionResult extractionResult) {
527         for (int i = 0; i < nodeList.getLength(); i++) {
528             final Node child = nodeList.item(i);
529             depthFirstNode(child, extractionResult);
530             popMappings(child);
531             popVocabulary(child);
532             popContext(child);
533         }
534     }
535 
536     /**
537      * Writes a triple on the extraction result.
538      *
539      * @param s
540      * @param p
541      * @param o
542      * @param extractionResult
543      */
544     private void writeTriple(Resource s, IRI p, Value o, ExtractionResult extractionResult) {
545         assert s != null : "subject   is null.";
546         assert p != null : "predicate is null.";
547         assert o != null : "object    is null.";
548         extractionResult.writeTriple(s, p, o);
549     }
550 
551     /**
552      * Processes the current node on the extraction algorithm. All the steps of this algorithm are annotated with the
553      * specification and section which describes it. The annotation is at form
554      * <em>RDFa&lt;spec-version%gt;[&lt;section&gt;]</em>
555      *
556      * @param currentElement
557      * @param extractionResult
558      * 
559      * @throws Exception
560      */
561     // TODO: add references to the RDFa 1.1 algorithm.
562     private void processNode(Node currentElement, ExtractionResult extractionResult) throws Exception {
563         final EvaluationContext currentEvaluationContext = getContext();
564         try {
565             if (currentElement.getNodeType() != Node.DOCUMENT_NODE && currentElement.getNodeType() != Node.ELEMENT_NODE)
566                 return;
567 
568             // RDFa1.1[7.5.3]
569             updateVocabulary(currentElement);
570 
571             // RDFa1.0[5.5.2] / RDFa1.1[7.5.4]
572             // Node currentElement = node;
573             updateIRIMapping(currentElement);
574 
575             // RDFa1.0[5.5.3] / RDFa1.1[7.5.5]
576             updateLanguage(currentElement, currentEvaluationContext);
577 
578             if (!isRelativeNode(currentElement)) {
579                 // RDFa1.0[5.5.4] / RDFa1.1[7.5.6]
580                 establishNewSubject(currentElement, currentEvaluationContext);
581             } else {
582                 // RDFa1.0[5.5.5] / RDFa1.1[7.5.7]
583                 establishNewSubjectCurrentObjectResource(currentElement, currentEvaluationContext);
584             }
585 
586             /*
587              * if(currentEvaluationContext.newSubject == null) { currentEvaluationContext.newSubject =
588              * resolveIRI(documentBase.toExternalForm()); } assert currentEvaluationContext.newSubject != null :
589              * "newSubject must be not null.";
590              */
591             if (currentEvaluationContext.newSubject == null)
592                 return;
593             if (logger.isDebugEnabled())
594                 logger.debug("newSubject: " + currentEvaluationContext.newSubject);
595 
596             // RDFa1.0[5.5.6] / RDFa1.1[7.5.8]
597             final IRI[] types = getTypes(currentElement);
598             for (IRI type : types) {
599                 writeTriple(currentEvaluationContext.newSubject, RDF.TYPE, type, extractionResult);
600             }
601 
602             // RDFa1.0[5.5.7] / RDFa1.1[7.5.9]
603             final IRI[] rels = getRels(currentElement);
604             final IRI[] revs = getRevs(currentElement);
605             if (currentEvaluationContext.currentObjectResource != null) {
606                 for (IRI rel : rels) {
607                     writeTriple(currentEvaluationContext.newSubject, rel,
608                             currentEvaluationContext.currentObjectResource, extractionResult);
609                 }
610                 for (IRI rev : revs) {
611                     writeTriple(currentEvaluationContext.currentObjectResource, rev,
612                             currentEvaluationContext.newSubject, extractionResult);
613                 }
614             } else { // RDFa1.0[5.5.8] / RDFa1.1[7.5.10]
615                 for (IRI rel : rels) {
616                     listOfIncompleteTriples.add(new IncompleteTriple(currentElement,
617                             currentEvaluationContext.newSubject, rel, IncompleteTripleDirection.Forward));
618                 }
619                 for (IRI rev : revs) {
620                     listOfIncompleteTriples.add(new IncompleteTriple(currentElement,
621                             currentEvaluationContext.newSubject, rev, IncompleteTripleDirection.Reverse));
622                 }
623             }
624 
625             // RDFa1.0[5.5.9] / RDFa1.1[7.5.11]
626             final Value currentObject = getCurrentObject(currentElement);
627             final IRI[] predicates = getPredicate(currentElement);
628             if (currentObject != null && predicates != null) {
629                 for (IRI predicate : predicates) {
630                     writeTriple(currentEvaluationContext.newSubject, predicate, currentObject, extractionResult);
631                 }
632             }
633 
634             // RDFa1.0[5.5.10] / RDFa1.1[7.5.12]
635             if (!currentEvaluationContext.skipElem && currentEvaluationContext.newSubject != null) {
636                 for (IncompleteTriple incompleteTriple : listOfIncompleteTriples) {
637                     incompleteTriple.produceTriple(currentElement, currentEvaluationContext.newSubject,
638                             extractionResult);
639                 }
640             }
641         } catch (Exception e) {
642             throw e;
643         } finally {
644             // RDFa1.0[5.5.11] / RDFa1.1[7.5.13]
645             if (currentEvaluationContext.recourse) {
646                 EvaluationContext newEvaluationContext = new EvaluationContext(currentEvaluationContext.base);
647                 if (currentEvaluationContext.skipElem) {
648                     newEvaluationContext.language = currentEvaluationContext.language;
649                 } else {
650                     newEvaluationContext.base = currentEvaluationContext.base;
651 
652                     if (currentEvaluationContext.newSubject != null) {
653                         newEvaluationContext.parentSubject = currentEvaluationContext.newSubject;
654                     } else {
655                         newEvaluationContext.parentSubject = currentEvaluationContext.parentSubject;
656                     }
657 
658                     if (currentEvaluationContext.currentObjectResource != null) {
659                         newEvaluationContext.parentObject = currentEvaluationContext.currentObjectResource;
660                     } else if (currentEvaluationContext.newSubject != null) {
661                         newEvaluationContext.parentObject = currentEvaluationContext.newSubject;
662                     } else {
663                         newEvaluationContext.parentObject = currentEvaluationContext.parentSubject;
664                     }
665 
666                     newEvaluationContext.language = currentEvaluationContext.language;
667                 }
668                 pushContext(currentElement, newEvaluationContext);
669             }
670         }
671     }
672 
673     /**
674      * Extract IRI namespaces (prefixes) from the current node.
675      *
676      * @param node
677      * @param prefixMapList
678      */
679     private void extractPrefixes(Node node, List<PrefixMap> prefixMapList) {
680         final String prefixAttribute = DomUtils.readAttribute(node, PREFIX_ATTRIBUTE, null);
681         if (prefixAttribute == null)
682             return;
683         final String[] prefixParts = extractPrefixSections(prefixAttribute);
684         for (String prefixPart : prefixParts) {
685             int splitPoint = prefixPart.indexOf(IRI_PREFIX_SEPARATOR);
686             final String prefix = prefixPart.substring(0, splitPoint);
687             if (prefix.length() == 0) {
688                 reportError(node,
689                         String.format(Locale.ROOT, "Invalid prefix length in prefix attribute '%s'", prefixAttribute));
690                 continue;
691             }
692             final IRI iri;
693             final String iriStr = prefixPart.substring(splitPoint + 1);
694             try {
695                 iri = resolveIRI(iriStr);
696             } catch (Exception e) {
697                 reportError(node, String.format(Locale.ROOT, "Resolution of prefix '%s' defines an invalid IRI: '%s'",
698                         prefixAttribute, iriStr));
699                 continue;
700             }
701             prefixMapList.add(new PrefixMap(prefix, iri));
702         }
703     }
704 
705     /**
706      * Updates the current language.
707      *
708      * @param node
709      * @param currentEvaluationContext
710      */
711     private void updateLanguage(Node node, EvaluationContext currentEvaluationContext) {
712         final String candidateLanguage = DomUtils.readAttribute(node, XML_LANG_ATTRIBUTE, null);
713         if (candidateLanguage != null)
714             currentEvaluationContext.language = candidateLanguage;
715     }
716 
717     /**
718      * Establish the new subject for the current recursion. See <i>RDFa 1.0 Specification section 5.5.4</i>, <i>RDFa 1.1
719      * Specification section 7.5.6</i>.
720      *
721      * @param node
722      * @param currentEvaluationContext
723      * 
724      * @throws URISyntaxException
725      */
726     private void establishNewSubject(Node node, EvaluationContext currentEvaluationContext) throws URISyntaxException {
727         String candidateIRIOrCURIE;
728         for (String subjectAttribute : SUBJECT_ATTRIBUTES) {
729             candidateIRIOrCURIE = DomUtils.readAttribute(node, subjectAttribute, null);
730             if (candidateIRIOrCURIE != null) {
731                 currentEvaluationContext.newSubject = resolveCURIEOrIRI(candidateIRIOrCURIE, false);
732                 return;
733             }
734         }
735 
736         if (node.getNodeName().equalsIgnoreCase(HEAD_TAG) || node.getNodeName().equalsIgnoreCase(BODY_TAG)) {
737             currentEvaluationContext.newSubject = resolveIRI(currentEvaluationContext.base.toString());
738             return;
739         }
740 
741         if (DomUtils.hasAttribute(node, TYPEOF_ATTRIBUTE)) {
742             currentEvaluationContext.newSubject = RDFUtils.bnode();
743             return;
744         }
745 
746         if (DomUtils.hasAttribute(node, PROPERTY_ATTRIBUTE)) {
747             currentEvaluationContext.skipElem = true;
748         }
749         if (currentEvaluationContext.parentObject != null) {
750             currentEvaluationContext.newSubject = (Resource) currentEvaluationContext.parentObject;
751             return;
752         }
753 
754         currentEvaluationContext.newSubject = null;
755     }
756 
757     /**
758      * Establishes the new subject and the current object resource.
759      *
760      * See <i>RDFa 1.0 Specification section 5.5.5</i>, <i>RDFa 1.1 Specification section 7.5.7</i>.
761      *
762      * @param node
763      * @param currentEvaluationContext
764      * 
765      * @throws URISyntaxException
766      */
767     private void establishNewSubjectCurrentObjectResource(Node node, EvaluationContext currentEvaluationContext)
768             throws URISyntaxException {
769         // Subject.
770         String candidateIRIOrCURIE;
771         candidateIRIOrCURIE = DomUtils.readAttribute(node, ABOUT_ATTRIBUTE, null);
772         if (candidateIRIOrCURIE != null) {
773             currentEvaluationContext.newSubject = resolveCURIEOrIRI(candidateIRIOrCURIE, false);
774         } else {
775             candidateIRIOrCURIE = DomUtils.readAttribute(node, SRC_ATTRIBUTE, null);
776             if (candidateIRIOrCURIE != null) {
777                 currentEvaluationContext.newSubject = resolveIRI(candidateIRIOrCURIE);
778             } else {
779                 if (node.getNodeName().equalsIgnoreCase(HEAD_TAG) || node.getNodeName().equalsIgnoreCase(BODY_TAG)) {
780                     currentEvaluationContext.newSubject = resolveIRI(currentEvaluationContext.base.toString());
781                 } else {
782                     if (DomUtils.hasAttribute(node, TYPEOF_ATTRIBUTE)) {
783                         currentEvaluationContext.newSubject = RDFUtils.bnode();
784                     } else {
785                         if (currentEvaluationContext.parentObject != null) {
786                             currentEvaluationContext.newSubject = (Resource) currentEvaluationContext.parentObject;
787                         }
788                     }
789                 }
790             }
791         }
792 
793         // Object.
794         candidateIRIOrCURIE = DomUtils.readAttribute(node, RESOURCE_ATTRIBUTE, null);
795         if (candidateIRIOrCURIE != null) {
796             currentEvaluationContext.currentObjectResource = resolveCURIEOrIRI(candidateIRIOrCURIE, false);
797             return;
798         }
799 
800         candidateIRIOrCURIE = DomUtils.readAttribute(node, HREF_ATTRIBUTE, null);
801         if (candidateIRIOrCURIE != null) {
802             currentEvaluationContext.currentObjectResource = resolveIRI(candidateIRIOrCURIE);
803             return;
804         }
805         currentEvaluationContext.currentObjectResource = null;
806     }
807 
808     private IRI[] getTypes(Node node) throws URISyntaxException {
809         final String typeOf = DomUtils.readAttribute(node, TYPEOF_ATTRIBUTE, null);
810         return resolveCIRIeOrIRIList(node, typeOf, true);
811     }
812 
813     private IRI[] getRels(Node node) throws URISyntaxException {
814         final String rel = DomUtils.readAttribute(node, REL_ATTRIBUTE, null);
815         return resolveCIRIeOrIRIList(node, rel, true);
816     }
817 
818     private IRI[] getRevs(Node node) throws URISyntaxException {
819         final String rev = DomUtils.readAttribute(node, REV_ATTRIBUTE, null);
820         return resolveCIRIeOrIRIList(node, rev, true);
821     }
822 
823     private IRI[] getPredicate(Node node) throws URISyntaxException {
824         final String candidateIRI = DomUtils.readAttribute(node, PROPERTY_ATTRIBUTE, null);
825         if (candidateIRI == null)
826             return null;
827         return resolveCIRIeOrIRIList(node, candidateIRI, true);
828     }
829 
830     /**
831      * Establishes the new object value. See <i>RDFa 1.0 Specification section 5.5.9</i>, <i>RDFa 1.1 Specification
832      * section 7.5.11</i>.
833      *
834      * @param node
835      * 
836      * @return
837      * 
838      * @throws URISyntaxException
839      * @throws IOException
840      * @throws TransformerException
841      */
842     private Value getCurrentObject(Node node) throws URISyntaxException, IOException, TransformerException {
843         final String candidateObject = DomUtils.readAttribute(node, HREF_ATTRIBUTE, null);
844         if (candidateObject != null) {
845             return resolveIRI(candidateObject);
846         } else {
847             return gerCurrentObjectLiteral(node);
848         }
849     }
850 
851     private Literal gerCurrentObjectLiteral(Node node) throws URISyntaxException, IOException, TransformerException {
852         final EvaluationContext currentEvaluationContext = getContext();
853         Literal literal;
854 
855         literal = getAsTypedLiteral(node);
856         if (literal != null)
857             return literal;
858 
859         literal = getAsXMLLiteral(node);
860         if (literal != null) {
861             currentEvaluationContext.recourse = false;
862             return literal;
863         }
864 
865         literal = getAsPlainLiteral(node, currentEvaluationContext.language);
866         if (literal != null)
867             return literal;
868 
869         return null;
870     }
871 
872     private static String getNodeContent(Node node) {
873         final String candidateContent = DomUtils.readAttribute(node, CONTENT_ATTRIBUTE, null);
874         if (candidateContent != null)
875             return candidateContent;
876         return node.getTextContent();
877     }
878 
879     /**
880      * Extracts the current typed literal from the given node. See <i>RDFa 1.0 Specification section 5.5.9.1</i>.
881      *
882      * @param node
883      * 
884      * @return
885      * 
886      * @throws URISyntaxException
887      */
888     private Literal getAsTypedLiteral(Node node) throws URISyntaxException {
889         final String datatype = DomUtils.readAttribute(node, DATATYPE_ATTRIBUTE, null);
890         if (datatype == null || datatype.trim().length() == 0 || XML_LITERAL_DATATYPE.equals(datatype.trim())) {
891             return null;
892         }
893         final Resource curieOrIRI = resolveCURIEOrIRI(datatype, true);
894         return RDFUtils.literal(getNodeContent(node), curieOrIRI instanceof IRI ? (IRI) curieOrIRI : null);
895     }
896 
897     private void pushMappings(Node sourceNode, List<PrefixMap> prefixMapList) {
898 
899         final Map<String, IRI> mapping = new HashMap<>();
900         for (PrefixMap prefixMap : prefixMapList) {
901             mapping.put(prefixMap.prefix, prefixMap.IRI);
902         }
903         IRIMappingStack.push(new IRIMapping(sourceNode, mapping));
904     }
905 
906     private void popMappings(Node node) {
907         if (IRIMappingStack.isEmpty())
908             return;
909         final IRIMapping peek = IRIMappingStack.peek();
910         if (!DomUtils.isAncestorOf(peek.sourceNode, node)) {
911             IRIMappingStack.pop();
912         }
913     }
914 
915     /**
916      * Resolve a namespaced IRI, if <code>safe</code> is <code>true</code> then the mapping must define a prefix,
917      * otherwise it is considered relative.
918      *
919      * @param mapping
920      * @param resolutionPolicy
921      * 
922      * @return
923      */
924     private Resource resolveNamespacedIRI(String mapping, ResolutionPolicy resolutionPolicy) {
925         if (mapping.indexOf(IRI_PATH_SEPARATOR) == 0) { // Begins with '/'
926             mapping = mapping.substring(1);
927         }
928 
929         final int prefixSeparatorIndex = mapping.indexOf(':');
930         if (prefixSeparatorIndex == -1) { // there is no prefix separator.
931             if (resolutionPolicy == ResolutionPolicy.NSRequired) {
932                 throw new IllegalArgumentException(
933                         String.format(Locale.ROOT, "Invalid mapping string [%s], must declare a prefix.", mapping));
934             }
935             if (resolutionPolicy == ResolutionPolicy.TermAllowed) {
936                 final IRI currentVocabulary = getVocabulary();
937                 // Mapping is a TERM.
938                 if (currentVocabulary != null) {
939                     return resolveIRI(currentVocabulary.toString() + mapping);
940                 }
941             }
942             return resolveIRI(documentBase.toString() + mapping);
943         }
944 
945         final String prefix = mapping.substring(0, prefixSeparatorIndex);
946         final IRI curieMapping = getMapping(prefix);
947         if (curieMapping == null) {
948             throw new IllegalArgumentException(String.format(Locale.ROOT, "Cannot map prefix '%s'", prefix));
949         }
950         final String candidateCURIEStr = curieMapping.toString() + mapping.substring(prefixSeparatorIndex + 1);
951         final java.net.URI candidateCURIE;
952         try {
953             candidateCURIE = new java.net.URI(candidateCURIEStr);
954         } catch (URISyntaxException IRIse) {
955             throw new IllegalArgumentException(String.format(Locale.ROOT, "Invalid CURIE '%s'", candidateCURIEStr));
956         }
957         return resolveIRI(candidateCURIE.isAbsolute() ? candidateCURIE.toString()
958                 : documentBase.toString() + candidateCURIE.toString());
959     }
960 
961     /**
962      * The resolution policy provided to the method {@link #resolveNamespacedIRI(String, ResolutionPolicy)}.
963      */
964     enum ResolutionPolicy {
965         NSNotRequired, NSRequired, TermAllowed
966     }
967 
968     /**
969      * Defines an evaluation context.
970      */
971     private class EvaluationContext {
972         private Node node;
973         private URL base;
974         private Resource parentSubject;
975         private Value parentObject;
976         private String language;
977         private boolean recourse;
978         private boolean skipElem;
979         private Resource newSubject;
980         private Resource currentObjectResource;
981 
982         /**
983          * Sections <em>RDFa1.0[5.5]</em>, <em>RDFa1.0[5.5.1]</em>, <em>RDFa1.1[7.5.1]</em> .
984          *
985          * @param base
986          */
987         EvaluationContext(URL base) {
988             this.base = base;
989             this.parentSubject = resolveIRI(base.toExternalForm());
990             this.parentObject = null;
991             this.language = null;
992             this.recourse = true;
993             this.skipElem = false;
994             this.newSubject = null;
995             this.currentObjectResource = null;
996         }
997     }
998 
999     /**
1000      * Defines a prefix mapping.
1001      */
1002     private static class PrefixMap {
1003         final String prefix;
1004         final IRI IRI;
1005 
1006         public PrefixMap(String prefix, IRI IRI) {
1007             this.prefix = prefix;
1008             this.IRI = IRI;
1009         }
1010     }
1011 
1012     /**
1013      * Defines a IRI mapping.
1014      */
1015     private static class IRIMapping {
1016         final Node sourceNode;
1017         final Map<String, IRI> map;
1018 
1019         public IRIMapping(Node sourceNode, Map<String, IRI> map) {
1020             this.sourceNode = sourceNode;
1021             this.map = map;
1022         }
1023     }
1024 
1025     /**
1026      * Defines the direction of an {@link IncompleteTriple}.
1027      */
1028     private enum IncompleteTripleDirection {
1029         Forward, Reverse
1030     }
1031 
1032     /**
1033      * Defines an incomplete triple.
1034      */
1035     private static class IncompleteTriple {
1036         final Node originatingNode;
1037         final Resource subject;
1038         final IRI predicate;
1039         final IncompleteTripleDirection direction;
1040 
1041         public IncompleteTriple(Node originatingNode, Resource subject, IRI predicate,
1042                 IncompleteTripleDirection direction) {
1043             if (originatingNode == null || subject == null || predicate == null || direction == null)
1044                 throw new IllegalArgumentException();
1045 
1046             this.originatingNode = originatingNode;
1047             this.subject = subject;
1048             this.predicate = predicate;
1049             this.direction = direction;
1050         }
1051 
1052         public boolean produceTriple(Node resourceNode, Resource r, ExtractionResult extractionResult) {
1053             if (!DomUtils.isAncestorOf(originatingNode, resourceNode, true))
1054                 return false;
1055 
1056             if (r == null)
1057                 throw new IllegalArgumentException();
1058             switch (direction) {
1059             case Forward:
1060                 extractionResult.writeTriple(subject, predicate, r);
1061                 break;
1062             case Reverse:
1063                 extractionResult.writeTriple(r, predicate, subject);
1064                 break;
1065             default:
1066                 throw new IllegalStateException();
1067             }
1068             return true;
1069         }
1070     }
1071 
1072     /**
1073      * Defines a vocabulary object.
1074      */
1075     private static class Vocabulary {
1076         final Node originatingNode;
1077         final IRI prefix;
1078 
1079         public Vocabulary(Node originatingNode, IRI prefix) {
1080             this.originatingNode = originatingNode;
1081             this.prefix = prefix;
1082         }
1083     }
1084 
1085 }