View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.rdfa;
19  
20  import org.apache.any23.extractor.IssueReport;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.html.DomUtils;
23  import org.apache.any23.rdf.RDFUtils;
24  import org.openrdf.model.Literal;
25  import org.openrdf.model.Resource;
26  import org.openrdf.model.URI;
27  import org.openrdf.model.Value;
28  import org.openrdf.model.vocabulary.RDF;
29  import org.slf4j.Logger;
30  import org.slf4j.LoggerFactory;
31  import org.w3c.dom.Document;
32  import org.w3c.dom.NamedNodeMap;
33  import org.w3c.dom.Node;
34  import org.w3c.dom.NodeList;
35  
36  import javax.xml.transform.TransformerException;
37  import java.io.IOException;
38  import java.net.MalformedURLException;
39  import java.net.URISyntaxException;
40  import java.net.URL;
41  import java.util.ArrayList;
42  import java.util.HashMap;
43  import java.util.List;
44  import java.util.Map;
45  import java.util.Stack;
46  
47  /**
48   * This parser is able to extract <a href="http://www.w3.org/TR/rdfa-syntax/">RDFa 1.0</a> and
49   * <a href="http://www.w3.org/TR/rdfa-core/">RDFa 1.1</a> statements from any <i>(X)HTML</i> document.
50   *
51   * @author Michele Mostarda (mostarda@fbk.eu)
52   */
53  public class RDFa11Parser {
54  
55      private static final Logger logger = LoggerFactory.getLogger(RDFa11Parser.class);
56  
57      public static final String CURIE_SEPARATOR      = ":";
58      public static final char   URI_PREFIX_SEPARATOR = ':';
59      public static final String URI_SCHEMA_SEPARATOR = "://";
60      public static final String URI_PATH_SEPARATOR   = "/";
61  
62      public static final String HEAD_TAG = "HEAD";
63      public static final String BODY_TAG = "BODY";
64  
65      public static final String XMLNS_ATTRIBUTE    = "xmlns";
66      public static final String XML_LANG_ATTRIBUTE = "xml:lang";
67  
68      public static final String REL_ATTRIBUTE      = "rel";
69      public static final String REV_ATTRIBUTE      = "rev";
70  
71      public static final String ABOUT_ATTRIBUTE    = "about";
72      public static final String RESOURCE_ATTRIBUTE = "resource";
73      public static final String SRC_ATTRIBUTE      = "src";
74      public static final String HREF_ATTRIBUTE     = "href";
75  
76      public static final String TYPE_ATTRIBUTE     = "type";
77      public static final String ATTRIBUTE_CSS      = "text/css";
78  
79      public static final String[] SUBJECT_ATTRIBUTES = {
80              ABOUT_ATTRIBUTE,
81              SRC_ATTRIBUTE,
82              RESOURCE_ATTRIBUTE,
83              HREF_ATTRIBUTE
84      };
85  
86      public static final String PREFIX_ATTRIBUTE   = "prefix";
87      public static final String TYPEOF_ATTRIBUTE   = "typeof";
88      public static final String PROPERTY_ATTRIBUTE = "property";
89      public static final String DATATYPE_ATTRIBUTE = "datatype";
90      public static final String CONTENT_ATTRIBUTE  = "content";
91      public static final String VOCAB_ATTRIBUTE    = "vocab";
92      // TODO: introduce support for RDFa profiles. (http://www.w3.org/TR/rdfa-core/#s_profiles)
93      public static final String PROFILE_ATTRIBUTE  = "profile";
94  
95      public static final String XML_LITERAL_DATATYPE = "rdf:XMLLiteral";
96  
97      public static final String XMLNS_DEFAULT = "http://www.w3.org/1999/xhtml";
98  
99      private IssueReport issueReport;
100 
101     private URL documentBase;
102 
103     private final Stack<URIMapping> uriMappingStack = new Stack<URIMapping>();
104 
105     private final Stack<Vocabulary> vocabularyStack = new Stack<Vocabulary>();
106 
107     private final List<IncompleteTriple> listOfIncompleteTriples = new ArrayList<IncompleteTriple>();
108 
109     private final Stack<EvaluationContext> evaluationContextStack = new Stack<EvaluationContext>();
110 
111     protected static URL getDocumentBase(URL documentURL, Document document) throws MalformedURLException {
112         String base;
113         base = DomUtils.find(document, "/HTML/HEAD/BASE/@href");                  // Non XHTML documents.
114         if( ! "".equals(base) ) return new URL(base);
115         base = DomUtils.find(document, "//*/h:head/h:base[position()=1]/@href");  // XHTML documents.
116         if( ! "".equals(base) ) return new URL(base);
117         return documentURL;
118     }
119 
120     /**
121      * Given a prefix declaration returns a list of <code>prefixID:prefixURL</code> strings
122      * normalizing blanks where present.
123      *
124      * @param prefixesDeclaration
125      * @return list of extracted prefixes.
126      */
127     protected static String[] extractPrefixSections(String prefixesDeclaration) {
128         final String[] parts = prefixesDeclaration.split("\\s");
129         final List<String> out = new ArrayList<String>();
130         int i = 0;
131         while(i < parts.length) {
132             final String part = parts[i];
133             if(part.length() == 0) {
134                 i++;
135                 continue;
136             }
137             if(part.charAt( part.length() -1 ) == URI_PREFIX_SEPARATOR) {
138                 i++;
139                 while(i < parts.length && parts[i].length() == 0) i++;
140                 out.add( part + (i < parts.length ? parts[i] : "") );
141                 i++;
142             } else {
143                 out.add(parts[i]);
144                 i++;
145             }
146         }
147         return out.toArray( new String[out.size()] );
148     }
149 
150     protected static boolean isAbsoluteURI(String uri) {
151         return uri.contains(URI_SCHEMA_SEPARATOR);
152     }
153 
154     protected static boolean isCURIE(String curie) {
155         if(curie == null) {
156             throw new NullPointerException("curie string cannot be null.");
157         }
158         if(curie.trim().length() == 0) return false;
159 
160         // '[' PREFIX ':' VALUE ']'
161         if( curie.charAt(0) != '[' || curie.charAt(curie.length() -1) != ']') return false;
162         int separatorIndex = curie.indexOf(CURIE_SEPARATOR);
163         return separatorIndex > 0 && curie.indexOf(CURIE_SEPARATOR, separatorIndex + 1) == -1;
164     }
165 
166     protected static boolean isCURIEBNode(String curie) {
167         return isCURIE(curie) && curie.substring(1, curie.length() -1).split(CURIE_SEPARATOR)[0].equals("_");
168     }
169 
170     protected static boolean isRelativeNode(Node node) {
171         if( ATTRIBUTE_CSS.equals( DomUtils.readAttribute(node, TYPE_ATTRIBUTE) ) ) return false;
172         return DomUtils.hasAttribute(node, REL_ATTRIBUTE) || DomUtils.hasAttribute(node, REV_ATTRIBUTE);
173     }
174 
175     // RDFa1.0[5.5.9.2]
176     protected static Literal getAsPlainLiteral(Node node, String currentLanguage) {
177         final String content = DomUtils.readAttribute(node, CONTENT_ATTRIBUTE, null);
178         if(content != null) return RDFUtils.literal(content, currentLanguage);
179 
180         if(! node.hasChildNodes() ) return RDFUtils.literal("", currentLanguage);
181 
182         final String nodeTextContent = node.getTextContent();
183         return nodeTextContent == null ? null : RDFUtils.literal(nodeTextContent.trim(), currentLanguage);
184     }
185 
186     protected static Literal getAsXMLLiteral(Node node) throws IOException, TransformerException {
187         final String datatype = DomUtils.readAttribute(node, DATATYPE_ATTRIBUTE, null);
188         if(! XML_LITERAL_DATATYPE.equals(datatype)) return null;
189 
190         final String xmlSerializedNode = DomUtils.serializeToXML(node, false);
191         return RDFUtils.literal(xmlSerializedNode, RDF.XMLLITERAL);
192     }
193 
194     protected static boolean isXMLNSDeclared(Document document) {
195         final String attributeValue = document.getDocumentElement().getAttribute(XMLNS_ATTRIBUTE);
196         if(attributeValue.length() == 0) return false;
197         return XMLNS_DEFAULT.equals(attributeValue);
198     }
199 
200     public RDFa11Parser() {}
201 
202     /**
203      * <a href="http://www.w3.org/TR/rdfa-syntax/#s_model">RDFa Syntax - Processing Model</a>.
204      *
205      * @param documentURL
206      * @param extractionResult
207      * @param document
208      */
209     public void processDocument(URL documentURL, Document document, ExtractionResult extractionResult)
210     throws RDFa11ParserException {
211         try {
212             this.issueReport = extractionResult;
213 
214             // Check RDFa1.0[4.1.3] : default XMLNS declaration.
215             if( ! isXMLNSDeclared(document)) {
216                 reportError(
217                         document.getDocumentElement(),
218                         String.format(
219                                 "The default %s namespace is expected to be declared and equal to '%s' .",
220                                 XMLNS_ATTRIBUTE, XMLNS_DEFAULT
221                         )
222                 );
223             }
224 
225             try {
226                 documentBase = getDocumentBase(documentURL, document);
227             } catch (MalformedURLException murle) {
228                 throw new RDFa11ParserException("Invalid document base URL.", murle);
229             }
230 
231             // RDFa1.0[5.5.1]
232             pushContext(document, new EvaluationContext(documentBase));
233 
234             depthFirstNode(document, extractionResult);
235 
236             assert listOfIncompleteTriples.isEmpty()
237                     :
238                    "The list of incomplete triples is expected to be empty at the end of processing.";
239         } finally {
240             reset();
241         }
242     }
243 
244     /**
245      * Resets the parser to the original state.
246      */
247     public void reset() {
248         issueReport = null;
249         documentBase  = null;
250         uriMappingStack.clear();
251         listOfIncompleteTriples.clear();
252         evaluationContextStack.clear();
253     }
254 
255     /**
256      * Updates the vocabulary context with possible <em>@vocab</em> declarations.
257      *
258      * @param currentNode the current node.
259      */
260     protected void updateVocabulary(Node currentNode) {
261         final String vocabularyStr = DomUtils.readAttribute(currentNode, VOCAB_ATTRIBUTE, null);
262         if(vocabularyStr == null) return;
263         try {
264             pushVocabulary(currentNode, RDFUtils.uri(vocabularyStr));
265         } catch (Exception e) {
266             reportError(currentNode, String.format("Invalid vocabulary [%s], must be a URI.", vocabularyStr));
267         }
268     }
269 
270     /**
271      * Updates the URI mapping with the XMLNS attributes declared in the current node.
272      *
273      * @param node input node.
274      */
275     protected void updateURIMapping(Node node) {
276         final NamedNodeMap attributes = node.getAttributes();
277         if (null == attributes) return;
278 
279         Node attribute;
280         final List<PrefixMap> prefixMapList = new ArrayList<PrefixMap>();
281         final String namespacePrefix = XMLNS_ATTRIBUTE + URI_PREFIX_SEPARATOR;
282         for (int a = 0; a < attributes.getLength(); a++) {
283             attribute = attributes.item(a);
284             if (attribute.getNodeName().startsWith(namespacePrefix)) {
285                 prefixMapList.add(
286                         new PrefixMap(
287                             attribute.getNodeName().substring(namespacePrefix.length()),
288                             resolveURI(attribute.getNodeValue())
289                         )
290                 );
291             }
292         }
293 
294         extractPrefixes(node, prefixMapList);
295 
296         if(prefixMapList.size() == 0) return;
297         pushMappings(
298                 node,
299                 prefixMapList
300         );
301     }
302 
303     /**
304      * Returns a URI mapping for a given prefix.
305      *
306      * @param prefix input prefix.
307      * @return URI mapping.
308      */
309     protected URI getMapping(String prefix) {
310         for (URIMapping uriMapping : uriMappingStack) {
311             final URI mapping = uriMapping.map.get(prefix);
312             if (mapping != null) {
313                 return mapping;
314             }
315         }
316         return null;
317     }
318 
319     /**
320      * Resolves a <rm>whitelist</em> separated list of <i>CURIE</i> or <i>URI</i>.
321      *
322      * @param n current node.
323      * @param curieOrURIList list of CURIE/URI.
324      * @return list of resolved URIs.
325      * @throws URISyntaxException
326      */
327     protected URI[] resolveCurieOrURIList(Node n, String curieOrURIList, boolean termAllowed)
328     throws URISyntaxException {
329         if(curieOrURIList == null || curieOrURIList.trim().length() == 0) return new URI[0];
330 
331         final String[] curieOrURIListParts = curieOrURIList.split("\\s");
332         final List<URI> result = new ArrayList<URI>();
333         Resource curieOrURI;
334         for(String curieORURIListPart : curieOrURIListParts) {
335             curieOrURI = resolveCURIEOrURI(curieORURIListPart, termAllowed);
336             if(curieOrURI != null && curieOrURI instanceof URI) {
337                 result.add((URI) curieOrURI);
338             } else {
339                 reportError(n, String.format("Invalid CURIE '%s' : expected URI, found BNode.", curieORURIListPart));
340             }
341         }
342         return result.toArray(new URI[result.size()]);
343     }
344 
345     /**
346      * Resolves a URI string as URI.
347      *
348      * @param uriStr (partial) URI string to be resolved.
349      * @return the resolved URI.
350      */
351     protected URI resolveURI(String uriStr) {
352         return
353                 isAbsoluteURI(uriStr)
354                         ?
355                 RDFUtils.uri(uriStr)
356                         :
357                 RDFUtils.uri( this.documentBase.toExternalForm(), uriStr );
358     }
359 
360     /**
361      * Resolves a <i>CURIE</i> or <i>URI</i> string.
362      *
363      * @param curieOrURI
364      * @param termAllowed if <code>true</code> the resolution can be a term.
365      * @return the resolved resource.
366      */
367     protected Resource resolveCURIEOrURI(String curieOrURI, boolean termAllowed) {
368         if( isCURIE(curieOrURI) ) {
369             return resolveNamespacedURI(curieOrURI.substring(1, curieOrURI.length() - 1), ResolutionPolicy.NSRequired);
370         }
371         if(isAbsoluteURI(curieOrURI)) return resolveURI(curieOrURI);
372         return resolveNamespacedURI(
373                 curieOrURI,
374                 termAllowed ? ResolutionPolicy.TermAllowed : ResolutionPolicy.NSNotRequired
375         );
376     }
377 
378     /**
379      * Pushes a context whiting the evaluation context stack, associated to tha given generation node.
380      *
381      * @param current
382      * @param ec
383      */
384     private void pushContext(Node current, EvaluationContext ec) {
385         ec.node = current;
386         evaluationContextStack.push(ec);
387     }
388 
389     /**
390      * @return the peek evaluation context.
391      */
392     private EvaluationContext getContext() {
393         return evaluationContextStack.peek();
394     }
395 
396     /**
397      * Pops out the peek evaluation context if ancestor of current node.
398      *
399      * @param current current node.
400      */
401     private void popContext(Node current) {
402         final Node peekNode = evaluationContextStack.peek().node;
403         if(DomUtils.isAncestorOf(peekNode, current)) {
404             evaluationContextStack.pop();
405         }
406     }
407 
408     /**
409      * Pushes a new vocabulary definition.
410      *
411      * @param currentNode node proving the vocabulary.
412      * @param vocab the vocabulary URI.
413      */
414     private void pushVocabulary(Node currentNode, URI vocab) {
415         vocabularyStack.push( new Vocabulary(currentNode, vocab) );
416     }
417 
418     /**
419      * @return the current peek vocabulary.
420      */
421     private URI getVocabulary() {
422         if(vocabularyStack.isEmpty()) return null;
423         return vocabularyStack.peek().prefix;
424     }
425 
426     /**
427      * Pops out the vocabulary definition.
428      *
429      * @param current
430      */
431     private void popVocabulary(Node current) {
432         if(vocabularyStack.isEmpty()) return;
433         if(DomUtils.isAncestorOf(current, vocabularyStack.peek().originatingNode)) {
434             vocabularyStack.pop();
435         }
436     }
437 
438     /**
439      * Purge all incomplete triples originated from a node that is descendant of <code>current</code>.
440      *
441      * @param current
442      */
443     private void purgeIncompleteTriples(Node current) {
444         final List<IncompleteTriple> toBePurged = new ArrayList<IncompleteTriple>();
445         for(IncompleteTriple incompleteTriple : listOfIncompleteTriples) {
446             if( DomUtils.isAncestorOf(current, incompleteTriple.originatingNode, true) ) {
447                 toBePurged.add(incompleteTriple);
448             }
449         }
450         listOfIncompleteTriples.removeAll(toBePurged);
451         toBePurged.clear();
452     }
453 
454     /**
455      * Reports an error to the error reporter.
456      *
457      * @param n originating node.
458      * @param msg human readable message.
459      */
460     private void reportError(Node n, String msg) {
461         final String errorMsg = String.format(
462                 "Error while processing node [%s] : '%s'",
463                 DomUtils.getXPathForNode(n), msg
464         );
465         final int[] errorLocation = DomUtils.getNodeLocation(n);
466         this.issueReport.notifyIssue(
467                 IssueReport.IssueLevel.Warning,
468                 errorMsg,
469                 errorLocation == null ? -1 : errorLocation[0],
470                 errorLocation == null ? -1 : errorLocation[1]
471         );
472     }
473 
474     /**
475      * Performs a <i>deep-first</i> tree visit on the given root node.
476      *
477      * @param node root node.
478      * @param extractionResult
479      */
480     private void depthFirstNode(Node node, ExtractionResult extractionResult) {
481         try {
482             processNode(node, extractionResult);
483         } catch (Exception e) {
484             if(logger.isDebugEnabled()) logger.debug("Error while processing node.", e);
485             reportError(node, e.getMessage());
486             // e.printStackTrace();
487         }
488         depthFirstChildren(node.getChildNodes(), extractionResult);
489         purgeIncompleteTriples(node);
490     }
491 
492     /**
493      * Performs a <i>deep-first</i> children list visit.
494      *
495      * @param nodeList
496      * @param extractionResult
497      */
498     private void depthFirstChildren(NodeList nodeList, ExtractionResult extractionResult) {
499         for(int i = 0; i < nodeList.getLength(); i++) {
500             final Node child = nodeList.item(i);
501             depthFirstNode(child, extractionResult);
502             popMappings(child);
503             popVocabulary(child);
504             popContext(child);
505         }
506     }
507 
508     /**
509      * Writes a triple on the extraction result.
510      *
511      * @param s
512      * @param p
513      * @param o
514      * @param extractionResult
515      */
516     private void writeTriple(Resource s, URI p, Value o, ExtractionResult extractionResult) {
517         // if(logger.isTraceEnabled()) logger.trace(String.format("writeTriple(%s %s %s)" , s, p, o));
518         assert s != null : "subject   is null.";
519         assert p != null : "predicate is null.";
520         assert o != null : "object    is null.";
521         extractionResult.writeTriple(s, p, o);
522     }
523 
524     /**
525      * Processes the current node on the extraction algorithm.
526      * All the steps of this algorithm are annotated with the
527      * specification and section which describes it. The annotation is at form
528      * <em>RDFa&lt;spec-version%gt;[&lt;section&gt;]</em>
529      *
530      * @param currentElement
531      * @param extractionResult
532      * @throws Exception
533      */
534     // TODO: add references to the RDFa 1.1 algorithm.
535     private void processNode(Node currentElement, ExtractionResult extractionResult) throws Exception {
536         // if(logger.isTraceEnabled()) logger.trace("processNode(" + DomUtils.getXPathForNode(currentElement) + ")");
537         final EvaluationContext currentEvaluationContext = getContext();
538         try {
539             if(
540                 currentElement.getNodeType() != Node.DOCUMENT_NODE
541                 &&
542                 currentElement.getNodeType() != Node.ELEMENT_NODE
543             ) return;
544 
545             // RDFa1.1[7.5.3]
546             updateVocabulary(currentElement);
547 
548             // RDFa1.0[5.5.2] / RDFa1.1[7.5.4]
549             //Node currentElement = node;
550             updateURIMapping(currentElement);
551 
552             // RDFa1.0[5.5.3] / RDFa1.1[7.5.5]
553             updateLanguage(currentElement, currentEvaluationContext);
554 
555             if(! isRelativeNode(currentElement)) {
556                 // RDFa1.0[5.5.4] / RDFa1.1[7.5.6]
557                 establishNewSubject(currentElement, currentEvaluationContext);
558             } else {
559                 // RDFa1.0[5.5.5] / RDFa1.1[7.5.7]
560                 establishNewSubjectCurrentObjectResource(
561                         currentElement,
562                         currentEvaluationContext
563                 );
564             }
565 
566             /*
567             if(currentEvaluationContext.newSubject == null) {
568                 currentEvaluationContext.newSubject = resolveURI(documentBase.toExternalForm());
569             }
570             assert currentEvaluationContext.newSubject != null : "newSubject must be not null.";
571             */
572             if(currentEvaluationContext.newSubject == null) return;
573             if(logger.isDebugEnabled()) logger.debug("newSubject: " + currentEvaluationContext.newSubject);
574 
575             // RDFa1.0[5.5.6] / RDFa1.1[7.5.8]
576             final URI[] types = getTypes(currentElement);
577             for(URI type : types) {
578                 writeTriple(currentEvaluationContext.newSubject, RDF.TYPE, type, extractionResult);
579             }
580 
581             // RDFa1.0[5.5.7] / RDFa1.1[7.5.9]
582             final URI[] rels = getRels(currentElement);
583             final URI[] revs = getRevs(currentElement);
584             if(currentEvaluationContext.currentObjectResource != null) {
585                 for (URI rel : rels) {
586                     writeTriple(
587                             currentEvaluationContext.newSubject,
588                             rel,
589                             currentEvaluationContext.currentObjectResource,
590                             extractionResult
591                     );
592                 }
593                 for (URI rev : revs) {
594                     writeTriple(
595                             currentEvaluationContext.currentObjectResource,
596                             rev,
597                             currentEvaluationContext.newSubject, extractionResult
598                     );
599                 }
600             } else { // RDFa1.0[5.5.8] / RDFa1.1[7.5.10]
601                 for(URI rel : rels) {
602                     listOfIncompleteTriples.add(
603                             new IncompleteTriple(
604                                     currentElement,
605                                     currentEvaluationContext.newSubject,
606                                     rel,
607                                     IncompleteTripleDirection.Forward
608                             )
609                     );
610                 }
611                 for(URI rev : revs) {
612                     listOfIncompleteTriples.add(
613                             new IncompleteTriple(
614                                     currentElement,
615                                     currentEvaluationContext.newSubject,
616                                     rev,
617                                     IncompleteTripleDirection.Reverse
618                             )
619                     );
620                 }
621             }
622 
623             // RDFa1.0[5.5.9] / RDFa1.1[7.5.11]
624             final Value currentObject = getCurrentObject(currentElement);
625             final URI[] predicates = getPredicate(currentElement);
626             if (currentObject != null && predicates != null) {
627                 for (URI predicate : predicates) {
628                     writeTriple(currentEvaluationContext.newSubject, predicate, currentObject, extractionResult);
629                 }
630             }
631 
632             // RDFa1.0[5.5.10] / RDFa1.1[7.5.12]
633             if(!currentEvaluationContext.skipElem && currentEvaluationContext.newSubject != null) {
634                 for (IncompleteTriple incompleteTriple : listOfIncompleteTriples) {
635                     incompleteTriple.produceTriple(
636                             currentElement,
637                             currentEvaluationContext.newSubject,
638                             extractionResult
639                     );
640                 }
641             }
642         } catch (Exception e) {
643             throw e;
644         } finally {
645             // RDFa1.0[5.5.11] / RDFa1.1[7.5.13]
646             if(currentEvaluationContext.recourse) {
647                 EvaluationContext newEvaluationContext = new EvaluationContext(currentEvaluationContext.base);
648                 if(currentEvaluationContext.skipElem) {
649                     newEvaluationContext.language = currentEvaluationContext.language;
650                 } else {
651                     newEvaluationContext.base = currentEvaluationContext.base;
652 
653                     if(currentEvaluationContext.newSubject != null) {
654                         newEvaluationContext.parentSubject = currentEvaluationContext.newSubject;
655                     } else {
656                         newEvaluationContext.parentSubject = currentEvaluationContext.parentSubject;
657                     }
658 
659                     if(currentEvaluationContext.currentObjectResource != null) {
660                         newEvaluationContext.parentObject = currentEvaluationContext.currentObjectResource;
661                     } else if(currentEvaluationContext.newSubject != null) {
662                         newEvaluationContext.parentObject = currentEvaluationContext.newSubject;
663                     } else {
664                         newEvaluationContext.parentObject = currentEvaluationContext.parentSubject;
665                     }
666 
667                     newEvaluationContext.language = currentEvaluationContext.language;
668                 }
669                 pushContext(currentElement, newEvaluationContext);
670             }
671         }
672     }
673 
674     /**
675      * Extract URI namespaces (prefixes) from the current node.
676      *
677      * @param node
678      * @param prefixMapList
679      */
680     private void extractPrefixes(Node node, List<PrefixMap> prefixMapList) {
681         final String prefixAttribute = DomUtils.readAttribute(node, PREFIX_ATTRIBUTE, null);
682         if(prefixAttribute == null) return;
683         final String[] prefixParts = extractPrefixSections(prefixAttribute);
684         for(String prefixPart : prefixParts) {
685             int splitPoint = prefixPart.indexOf(URI_PREFIX_SEPARATOR);
686             final String prefix = prefixPart.substring(0, splitPoint);
687             if(prefix.length() == 0) {
688                 reportError(node, String.format("Invalid prefix length in prefix attribute '%s'", prefixAttribute));
689                 continue;
690             }
691             final URI uri;
692             final String uriStr = prefixPart.substring(splitPoint + 1);
693             try {
694                 uri = resolveURI(uriStr);
695             } catch (Exception e) {
696                 reportError(
697                         node,
698                         String.format(
699                                 "Resolution of prefix '%s' defines an invalid URI: '%s'",
700                                 prefixAttribute, uriStr
701                         )
702                 );
703                 continue;
704             }
705             prefixMapList.add( new PrefixMap(prefix, uri) );
706         }
707     }
708 
709     /**
710      * Updates the current language.
711      *
712      * @param node
713      * @param currentEvaluationContext
714      */
715     private void updateLanguage(Node node, EvaluationContext currentEvaluationContext) {
716         final String candidateLanguage = DomUtils.readAttribute(node, XML_LANG_ATTRIBUTE, null);
717         if(candidateLanguage != null) currentEvaluationContext.language = candidateLanguage;
718     }
719 
720     /**
721      * Establish the new subject for the current recursion.
722      * See <i>RDFa 1.0 Specification section 5.5.4</i>, <i>RDFa 1.1 Specification section 7.5.6</i>.
723      *
724      * @param node
725      * @param currentEvaluationContext
726      * @throws URISyntaxException
727      */
728     private void establishNewSubject(Node node, EvaluationContext currentEvaluationContext)
729     throws URISyntaxException {
730         String candidateURIOrCURIE;
731         for(String subjectAttribute : SUBJECT_ATTRIBUTES) {
732             candidateURIOrCURIE = DomUtils.readAttribute(node, subjectAttribute, null);
733             if(candidateURIOrCURIE != null) {
734                 currentEvaluationContext.newSubject = resolveCURIEOrURI(candidateURIOrCURIE, false);
735                 return;
736             }
737         }
738 
739         if(node.getNodeName().equalsIgnoreCase(HEAD_TAG) || node.getNodeName().equalsIgnoreCase(BODY_TAG)) {
740             currentEvaluationContext.newSubject = resolveURI(currentEvaluationContext.base.toString());
741             return;
742         }
743 
744         if(DomUtils.hasAttribute(node, TYPEOF_ATTRIBUTE)) {
745             currentEvaluationContext.newSubject = RDFUtils.bnode();
746             return;
747         }
748 
749         if(DomUtils.hasAttribute(node, PROPERTY_ATTRIBUTE)) {
750             currentEvaluationContext.skipElem = true;
751         }
752         if(currentEvaluationContext.parentObject != null) {
753             currentEvaluationContext.newSubject = (Resource) currentEvaluationContext.parentObject;
754             return;
755         }
756 
757         currentEvaluationContext.newSubject = null;
758     }
759 
760     /**
761      * Establishes the new subject and the current object resource.
762      *
763      * See <i>RDFa 1.0 Specification section 5.5.5</i>, <i>RDFa 1.1 Specification section 7.5.7</i>.
764      *
765      * @param node
766      * @param currentEvaluationContext
767      * @throws URISyntaxException
768      */
769     private void establishNewSubjectCurrentObjectResource(Node node, EvaluationContext currentEvaluationContext)
770     throws URISyntaxException {
771         // Subject.
772         String candidateURIOrCURIE;
773         candidateURIOrCURIE = DomUtils.readAttribute(node, ABOUT_ATTRIBUTE, null);
774         if(candidateURIOrCURIE != null) {
775             currentEvaluationContext.newSubject = resolveCURIEOrURI(candidateURIOrCURIE, false);
776         } else {
777             candidateURIOrCURIE = DomUtils.readAttribute(node, SRC_ATTRIBUTE, null);
778             if (candidateURIOrCURIE != null) {
779                 currentEvaluationContext.newSubject = resolveURI(candidateURIOrCURIE);
780             } else {
781                 if (node.getNodeName().equalsIgnoreCase(HEAD_TAG) || node.getNodeName().equalsIgnoreCase(BODY_TAG)) {
782                     currentEvaluationContext.newSubject = resolveURI(currentEvaluationContext.base.toString());
783                 } else {
784                     if (DomUtils.hasAttribute(node, TYPEOF_ATTRIBUTE)) {
785                         currentEvaluationContext.newSubject = RDFUtils.bnode();
786                     } else {
787                         if (currentEvaluationContext.parentObject != null) {
788                             currentEvaluationContext.newSubject = (Resource) currentEvaluationContext.parentObject;
789                         }
790                     }
791                 }
792             }
793         }
794 
795         // Object.
796         candidateURIOrCURIE = DomUtils.readAttribute(node, RESOURCE_ATTRIBUTE, null);
797         if(candidateURIOrCURIE != null) {
798             currentEvaluationContext.currentObjectResource = resolveCURIEOrURI(candidateURIOrCURIE, false);
799             return;
800         }
801 
802         candidateURIOrCURIE = DomUtils.readAttribute(node, HREF_ATTRIBUTE, null);
803         if(candidateURIOrCURIE != null) {
804             currentEvaluationContext.currentObjectResource = resolveURI(candidateURIOrCURIE);
805             return;
806         }
807         currentEvaluationContext.currentObjectResource = null;
808     }
809 
810     private URI[] getTypes(Node node) throws URISyntaxException {
811         final String typeOf = DomUtils.readAttribute(node, TYPEOF_ATTRIBUTE, null);
812         return resolveCurieOrURIList(node, typeOf, true);
813     }
814 
815     private URI[] getRels(Node node) throws URISyntaxException {
816         final String rel = DomUtils.readAttribute(node, REL_ATTRIBUTE, null);
817         return resolveCurieOrURIList(node, rel, true);
818     }
819 
820     private URI[] getRevs(Node node) throws URISyntaxException {
821         final String rev = DomUtils.readAttribute(node, REV_ATTRIBUTE, null);
822         return resolveCurieOrURIList(node, rev, true);
823     }
824 
825     private URI[] getPredicate(Node node) throws URISyntaxException {
826         final String candidateURI = DomUtils.readAttribute(node, PROPERTY_ATTRIBUTE, null);
827         if(candidateURI == null) return null;
828         return resolveCurieOrURIList(node, candidateURI, true);
829     }
830 
831     /**
832      * Establishes the new object value.
833      * See <i>RDFa 1.0 Specification section 5.5.9</i>, <i>RDFa 1.1 Specification section 7.5.11</i>.
834      *
835      * @param node
836      * @return
837      * @throws URISyntaxException
838      * @throws IOException
839      * @throws TransformerException
840      */
841     private Value getCurrentObject(Node node)
842     throws URISyntaxException, IOException, TransformerException {
843         final String candidateObject = DomUtils.readAttribute(node, HREF_ATTRIBUTE, null);
844         if(candidateObject != null) {
845             return resolveURI(candidateObject);
846         } else {
847             return gerCurrentObjectLiteral(node);
848         }
849     }
850 
851     private Literal gerCurrentObjectLiteral(Node node)
852     throws URISyntaxException, IOException, TransformerException {
853         final EvaluationContext currentEvaluationContext = getContext();
854         Literal literal;
855 
856         literal = getAsTypedLiteral(node);
857         if(literal != null) return literal;
858 
859         literal = getAsXMLLiteral(node);
860         if(literal != null) {
861             currentEvaluationContext.recourse = false;
862             return literal;
863         }
864 
865         literal = getAsPlainLiteral(node, currentEvaluationContext.language);
866         if(literal != null) return literal;
867 
868         return null;
869     }
870 
871     private static String getNodeContent(Node node) {
872         final String candidateContent = DomUtils.readAttribute(node, CONTENT_ATTRIBUTE, null);
873         if(candidateContent != null) return candidateContent;
874         return node.getTextContent();
875     }
876 
877     /**
878      * Extracts the current typed literal from the given node.
879      * See <i>RDFa 1.0 Specification section 5.5.9.1</i>.
880      *
881      * @param node
882      * @return
883      * @throws URISyntaxException
884      */
885     private Literal getAsTypedLiteral(Node node) throws URISyntaxException {
886         final String datatype = DomUtils.readAttribute(node, DATATYPE_ATTRIBUTE, null);
887         if (datatype == null || datatype.trim().length() == 0 || XML_LITERAL_DATATYPE.equals(datatype.trim()) ) {
888             return null;
889         }
890         final Resource curieOrURI = resolveCURIEOrURI(datatype, true);
891         return RDFUtils.literal(getNodeContent(node), curieOrURI instanceof URI ? (URI) curieOrURI : null);
892     }
893 
894     private void pushMappings(Node sourceNode, List<PrefixMap> prefixMapList) {
895         // logger.trace("pushMappings()");
896 
897         final Map<String, URI> mapping = new HashMap<String, URI>();
898         for (PrefixMap prefixMap : prefixMapList) {
899             mapping.put(prefixMap.prefix, prefixMap.uri);
900         }
901         uriMappingStack.push( new URIMapping(sourceNode, mapping) );
902     }
903 
904     private void popMappings(Node node) {
905         if(uriMappingStack.isEmpty()) return;
906         final URIMapping peek = uriMappingStack.peek();
907         if( ! DomUtils.isAncestorOf(peek.sourceNode, node) ) {
908             // logger.trace("popMappings()");
909             uriMappingStack.pop();
910         }
911     }
912 
913     /**
914      * Resolve a namespaced URI, if <code>safe</code> is <code>true</code>
915      * then the mapping must define a prefix, otherwise it is considered relative.
916      *
917      * @param mapping
918      * @param resolutionPolicy
919      * @return
920      */
921     private Resource resolveNamespacedURI(String mapping, ResolutionPolicy resolutionPolicy) {
922         if(mapping.indexOf(URI_PATH_SEPARATOR) == 0) { // Begins with '/'
923             mapping = mapping.substring(1);
924         }
925 
926         final int prefixSeparatorIndex = mapping.indexOf(':');
927         if(prefixSeparatorIndex == -1) { // there is no prefix separator.
928             if(resolutionPolicy == ResolutionPolicy.NSRequired) {
929                 throw new IllegalArgumentException(
930                         String.format("Invalid mapping string [%s], must declare a prefix.", mapping)
931                 );
932             }
933             if (resolutionPolicy == ResolutionPolicy.TermAllowed) {
934                 final URI currentVocabulary = getVocabulary();
935                 // Mapping is a TERM.
936                 if (currentVocabulary != null) {
937                     return resolveURI(currentVocabulary.toString() + mapping);
938                 }
939             }
940             return resolveURI(documentBase.toString() + mapping);
941         }
942 
943         final String prefix = mapping.substring(0, prefixSeparatorIndex);
944         final URI curieMapping = getMapping(prefix);
945         if(curieMapping == null) {
946             throw new IllegalArgumentException( String.format("Cannot map prefix '%s'", prefix) );
947         }
948         final String candidateCURIEStr = curieMapping.toString() + mapping.substring(prefixSeparatorIndex + 1);
949         final java.net.URI candidateCURIE;
950         try {
951             candidateCURIE = new java.net.URI(candidateCURIEStr);
952         } catch (URISyntaxException urise) {
953             throw new IllegalArgumentException(String.format("Invalid CURIE '%s'", candidateCURIEStr) );
954         }
955         return resolveURI(
956                 candidateCURIE.isAbsolute()
957                         ?
958                         candidateCURIE.toString()
959                         :
960                         documentBase.toString() + candidateCURIE.toString()
961         );
962     }
963 
964     /**
965      * The resolution policy provided to the method {@link #resolveNamespacedURI(String, ResolutionPolicy)}.
966      */
967     enum ResolutionPolicy {
968         NSNotRequired,
969         NSRequired,
970         TermAllowed
971     }
972 
973     /**
974      * Defines an evaluation context.
975      */
976     private class EvaluationContext {
977         private Node node;
978         private URL base;
979         private Resource parentSubject;
980         private Value parentObject;
981         private String language;
982         private boolean recourse;
983         private boolean skipElem;
984         private Resource newSubject;
985         private Resource currentObjectResource;
986 
987         /**
988          * Sections <em>RDFa1.0[5.5]</em>, <em>RDFa1.0[5.5.1]</em>, <em>RDFa1.1[7.5.1]</em> .
989          *
990          * @param base
991          */
992         EvaluationContext(URL base) {
993             this.base             = base;
994             this.parentSubject    = resolveURI( base.toExternalForm() );
995             this.parentObject     = null;
996             this.language         = null;
997             this.recourse         = true;
998             this.skipElem         = false;
999             this.newSubject       = null;
1000             this.currentObjectResource = null;
1001         }
1002     }
1003 
1004     /**
1005      * Defines a prefix mapping.
1006      */
1007     private class PrefixMap {
1008         final String prefix;
1009         final URI    uri;
1010         public PrefixMap(String prefix, URI uri) {
1011             this.prefix = prefix;
1012             this.uri = uri;
1013         }
1014     }
1015 
1016     /**
1017      * Defines a URI mapping.
1018      */
1019     private class URIMapping {
1020         final Node sourceNode;
1021         final Map<String, URI> map;
1022 
1023         public URIMapping(Node sourceNode, Map<String, URI> map) {
1024             this.sourceNode = sourceNode;
1025             this.map        = map;
1026         }
1027     }
1028 
1029     /**
1030      * Defines the direction of an {@link IncompleteTriple}.
1031      */
1032     private enum IncompleteTripleDirection {
1033         Forward,
1034         Reverse
1035     }
1036 
1037     /**
1038      * Defines an incomplete triple.
1039      */
1040     private class IncompleteTriple {
1041         final Node     originatingNode;
1042         final Resource subject;
1043         final URI      predicate;
1044         final IncompleteTripleDirection direction;
1045 
1046         public IncompleteTriple(
1047                 Node originatingNode,
1048                 Resource subject,
1049                 URI predicate,
1050                 IncompleteTripleDirection direction
1051         ) {
1052             if(originatingNode == null || subject == null || predicate == null || direction == null)
1053                 throw new IllegalArgumentException();
1054 
1055             this.originatingNode = originatingNode;
1056             this.subject         = subject;
1057             this.predicate       = predicate;
1058             this.direction       = direction;
1059         }
1060 
1061         public boolean produceTriple(Node resourceNode, Resource r, ExtractionResult extractionResult) {
1062             if( ! DomUtils.isAncestorOf(originatingNode, resourceNode, true) ) return false;
1063 
1064             if(r == null) throw new IllegalArgumentException();
1065             switch (direction) {
1066                 case Forward:
1067                     extractionResult.writeTriple(subject, predicate, r);
1068                     break;
1069                 case Reverse:
1070                     extractionResult.writeTriple(r, predicate, subject);
1071                     break;
1072                 default:
1073                     throw new IllegalStateException();
1074             }
1075             return true;
1076         }
1077     }
1078 
1079     /**
1080      * Defines a vocabulary object.
1081      */
1082     private class Vocabulary {
1083         final Node originatingNode;
1084         final URI prefix;
1085 
1086         public Vocabulary(Node originatingNode, URI prefix) {
1087             this.originatingNode = originatingNode;
1088             this.prefix = prefix;
1089         }
1090     }
1091 
1092 }