View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.rdfa;
19  
20  import java.io.IOException;
21  import java.net.MalformedURLException;
22  import java.net.URISyntaxException;
23  import java.net.URL;
24  import java.util.ArrayList;
25  import java.util.HashMap;
26  import java.util.List;
27  import java.util.Map;
28  import java.util.Stack;
29  import javax.xml.transform.TransformerException;
30  import org.apache.any23.extractor.ExtractionResult;
31  import org.apache.any23.extractor.IssueReport;
32  import org.apache.any23.extractor.html.DomUtils;
33  import org.apache.any23.rdf.RDFUtils;
34  import org.eclipse.rdf4j.model.IRI;
35  import org.eclipse.rdf4j.model.Literal;
36  import org.eclipse.rdf4j.model.Resource;
37  import org.eclipse.rdf4j.model.Value;
38  import org.eclipse.rdf4j.model.vocabulary.RDF;
39  import org.slf4j.Logger;
40  import org.slf4j.LoggerFactory;
41  import org.w3c.dom.Document;
42  import org.w3c.dom.NamedNodeMap;
43  import org.w3c.dom.Node;
44  import org.w3c.dom.NodeList;
45  
46  /**
47   * This parser is able to extract <a href="http://www.w3.org/TR/rdfa-syntax/">RDFa 1.0</a> and
48   * <a href="http://www.w3.org/TR/rdfa-core/">RDFa 1.1</a> statements from any <i>(X)HTML</i> document.
49   * @deprecated since 2.3 the {@link org.eclipse.rdf4j.rio.Rio} implementations 
50   * are used to parse RDFa. Look at {@link org.apache.any23.extractor.rdf.RDFParserFactory#getRDFa10Parser}
51   * and {@link org.apache.any23.extractor.rdf.RDFParserFactory#getRDFa11Parser}.
52   * 
53   * @author Michele Mostarda (mostarda@fbk.eu)
54   */
55  @Deprecated
56  public class RDFa11Parser {
57  
58      private static final Logger logger = LoggerFactory.getLogger(RDFa11Parser.class);
59  
60      public static final String CURIE_SEPARATOR      = ":";
61      public static final char   IRI_PREFIX_SEPARATOR = ':';
62      public static final String IRI_SCHEMA_SEPARATOR = "://";
63      public static final String IRI_PATH_SEPARATOR   = "/";
64  
65      public static final String HEAD_TAG = "HEAD";
66      public static final String BODY_TAG = "BODY";
67  
68      public static final String XMLNS_ATTRIBUTE    = "xmlns";
69      public static final String XML_LANG_ATTRIBUTE = "xml:lang";
70  
71      public static final String REL_ATTRIBUTE      = "rel";
72      public static final String REV_ATTRIBUTE      = "rev";
73  
74      public static final String ABOUT_ATTRIBUTE    = "about";
75      public static final String RESOURCE_ATTRIBUTE = "resource";
76      public static final String SRC_ATTRIBUTE      = "src";
77      public static final String HREF_ATTRIBUTE     = "href";
78  
79      public static final String TYPE_ATTRIBUTE     = "type";
80      public static final String ATTRIBUTE_CSS      = "text/css";
81  
82      public static final String[] SUBJECT_ATTRIBUTES = {
83              ABOUT_ATTRIBUTE,
84              SRC_ATTRIBUTE,
85              RESOURCE_ATTRIBUTE,
86              HREF_ATTRIBUTE
87      };
88  
89      public static final String PREFIX_ATTRIBUTE   = "prefix";
90      public static final String TYPEOF_ATTRIBUTE   = "typeof";
91      public static final String PROPERTY_ATTRIBUTE = "property";
92      public static final String DATATYPE_ATTRIBUTE = "datatype";
93      public static final String CONTENT_ATTRIBUTE  = "content";
94      public static final String VOCAB_ATTRIBUTE    = "vocab";
95      // TODO: introduce support for RDFa profiles. (http://www.w3.org/TR/rdfa-core/#s_profiles)
96      public static final String PROFILE_ATTRIBUTE  = "profile";
97  
98      public static final String XML_LITERAL_DATATYPE = "rdf:XMLLiteral";
99  
100     public static final String XMLNS_DEFAULT = "http://www.w3.org/1999/xhtml";
101 
102     private IssueReport issueReport;
103 
104     private URL documentBase;
105 
106     private final Stack<IRIMapping> IRIMappingStack = new Stack<>();
107 
108     private final Stack<Vocabulary> vocabularyStack = new Stack<>();
109 
110     private final List<IncompleteTriple> listOfIncompleteTriples = new ArrayList<>();
111 
112     private final Stack<EvaluationContext> evaluationContextStack = new Stack<>();
113 
114     public RDFa11Parser() {
115         //default constructor
116     }
117 
118     protected static URL getDocumentBase(URL documentURL, Document document) throws MalformedURLException {
119         String base;
120         base = DomUtils.find(document, "/HTML/HEAD/BASE/@href");                  // Non XHTML documents.
121         if( ! "".equals(base) )
122             return new URL(base);
123         base = DomUtils.find(document, "//*/h:head/h:base[position()=1]/@href");  // XHTML documents.
124         if( ! "".equals(base) )
125             return new URL(base);
126         return documentURL;
127     }
128 
129     /**
130      * Given a prefix declaration returns a list of <code>prefixID:prefixURL</code> strings
131      * normalizing blanks where present.
132      *
133      * @param prefixesDeclaration input prefix
134      * @return list of extracted prefixes.
135      */
136     protected static String[] extractPrefixSections(String prefixesDeclaration) {
137         final String[] parts = prefixesDeclaration.split("\\s");
138         final List<String> out = new ArrayList<>();
139         int i = 0;
140         while(i < parts.length) {
141             final String part = parts[i];
142             if(part.length() == 0) {
143                 i++;
144                 continue;
145             }
146             if(part.charAt( part.length() -1 ) == IRI_PREFIX_SEPARATOR) {
147                 i++;
148                 while(i < parts.length && parts[i].length() == 0)
149                     i++;
150                 out.add( part + (i < parts.length ? parts[i] : "") );
151                 i++;
152             } else {
153                 out.add(parts[i]);
154                 i++;
155             }
156         }
157         return out.toArray( new String[out.size()] );
158     }
159 
160     protected static boolean isAbsoluteIRI(String iri) {
161         return iri.contains(IRI_SCHEMA_SEPARATOR);
162     }
163 
164     protected static boolean isCURIE(String curie) {
165         if(curie == null) {
166             throw new NullPointerException("curie string cannot be null.");
167         }
168         if(curie.trim().length() == 0)
169             return false;
170 
171         // '[' PREFIX ':' VALUE ']'
172         if( curie.charAt(0) != '[' || curie.charAt(curie.length() -1) != ']')
173             return false;
174         int separatorIndex = curie.indexOf(CURIE_SEPARATOR);
175         return separatorIndex > 0 && curie.indexOf(CURIE_SEPARATOR, separatorIndex + 1) == -1;
176     }
177 
178     protected static boolean isCURIEBNode(String curie) {
179         return isCURIE(curie) && curie.substring(1, curie.length() -1).split(CURIE_SEPARATOR)[0].equals("_");
180     }
181 
182     protected static boolean isRelativeNode(Node node) {
183         if( ATTRIBUTE_CSS.equals( DomUtils.readAttribute(node, TYPE_ATTRIBUTE) ) )
184             return false;
185         return DomUtils.hasAttribute(node, REL_ATTRIBUTE) || DomUtils.hasAttribute(node, REV_ATTRIBUTE);
186     }
187 
188     // RDFa1.0[5.5.9.2]
189     protected static Literal getAsPlainLiteral(Node node, String currentLanguage) {
190         final String content = DomUtils.readAttribute(node, CONTENT_ATTRIBUTE, null);
191         if(content != null)
192             return RDFUtils.literal(content, currentLanguage);
193 
194         if(! node.hasChildNodes() )
195             return RDFUtils.literal("", currentLanguage);
196 
197         final String nodeTextContent = node.getTextContent();
198         return nodeTextContent == null ? null : RDFUtils.literal(nodeTextContent.trim(), currentLanguage);
199     }
200 
201     protected static Literal getAsXMLLiteral(Node node) throws IOException, TransformerException {
202         final String datatype = DomUtils.readAttribute(node, DATATYPE_ATTRIBUTE, null);
203         if(! XML_LITERAL_DATATYPE.equals(datatype))
204             return null;
205 
206         final String xmlSerializedNode = DomUtils.serializeToXML(node, false);
207         return RDFUtils.literal(xmlSerializedNode, RDF.XMLLITERAL);
208     }
209 
210     protected static boolean isXMLNSDeclared(Document document) {
211         final String attributeValue = document.getDocumentElement().getAttribute(XMLNS_ATTRIBUTE);
212         if(attributeValue.length() == 0)
213             return false;
214         return XMLNS_DEFAULT.equals(attributeValue);
215     }
216 
217     /**
218      * <a href="http://www.w3.org/TR/rdfa-syntax/#s_model">RDFa Syntax - Processing Model</a>.
219      *
220      * @param documentURL {@link java.net.URL} of the document to process
221      * @param extractionResult a {@link org.apache.any23.extractor.ExtractionResult} to populate
222      * @param document the {@link org.w3c.dom.Document} to populate with parse content
223      * @throws RDFa11ParserException if there is an error parsing the document
224      */
225     public void processDocument(URL documentURL, Document document, ExtractionResult extractionResult)
226     throws RDFa11ParserException {
227         try {
228             this.issueReport = extractionResult;
229 
230             // Check RDFa1.0[4.1.3] : default XMLNS declaration.
231             if( ! isXMLNSDeclared(document)) {
232                 reportError(
233                         document.getDocumentElement(),
234                         String.format(
235                                 "The default %s namespace is expected to be declared and equal to '%s' .",
236                                 XMLNS_ATTRIBUTE, XMLNS_DEFAULT
237                         )
238                 );
239             }
240 
241             try {
242                 documentBase = getDocumentBase(documentURL, document);
243             } catch (MalformedURLException murle) {
244                 throw new RDFa11ParserException("Invalid document base URL.", murle);
245             }
246 
247             // RDFa1.0[5.5.1]
248             pushContext(document, new EvaluationContext(documentBase));
249 
250             depthFirstNode(document, extractionResult);
251 
252             assert listOfIncompleteTriples.isEmpty()
253                     :
254                    "The list of incomplete triples is expected to be empty at the end of processing.";
255         } finally {
256             reset();
257         }
258     }
259 
260     /**
261      * Resets the parser to the original state.
262      */
263     public void reset() {
264         issueReport = null;
265         documentBase  = null;
266         IRIMappingStack.clear();
267         listOfIncompleteTriples.clear();
268         evaluationContextStack.clear();
269     }
270 
271     /**
272      * Updates the vocabulary context with possible <em>@vocab</em> declarations.
273      *
274      * @param currentNode the current node.
275      */
276     protected void updateVocabulary(Node currentNode) {
277         final String vocabularyStr = DomUtils.readAttribute(currentNode, VOCAB_ATTRIBUTE, null);
278         if(vocabularyStr == null)
279             return;
280         try {
281             pushVocabulary(currentNode, RDFUtils.iri(vocabularyStr));
282         } catch (Exception e) {
283             reportError(currentNode, String.format("Invalid vocabulary [%s], must be a IRI.", vocabularyStr));
284         }
285     }
286 
287     /**
288      * Updates the IRI mapping with the XMLNS attributes declared in the current node.
289      *
290      * @param node input node.
291      */
292     protected void updateIRIMapping(Node node) {
293         final NamedNodeMap attributes = node.getAttributes();
294         if (null == attributes)
295             return;
296 
297         Node attribute;
298         final List<PrefixMap> prefixMapList = new ArrayList<PrefixMap>();
299         final String namespacePrefix = XMLNS_ATTRIBUTE + IRI_PREFIX_SEPARATOR;
300         for (int a = 0; a < attributes.getLength(); a++) {
301             attribute = attributes.item(a);
302             if (attribute.getNodeName().startsWith(namespacePrefix)) {
303                 prefixMapList.add(
304                         new PrefixMap(
305                             attribute.getNodeName().substring(namespacePrefix.length()),
306                             resolveIRI(attribute.getNodeValue())
307                         )
308                 );
309             }
310         }
311 
312         extractPrefixes(node, prefixMapList);
313 
314         if(prefixMapList.size() == 0)
315             return;
316         pushMappings(
317                 node,
318                 prefixMapList
319         );
320     }
321 
322     /**
323      * Returns a IRI mapping for a given prefix.
324      *
325      * @param prefix input prefix.
326      * @return IRI mapping.
327      */
328     protected IRI getMapping(String prefix) {
329         for (IRIMapping IRIMapping : IRIMappingStack) {
330             final IRI mapping = IRIMapping.map.get(prefix);
331             if (mapping != null) {
332                 return mapping;
333             }
334         }
335         return null;
336     }
337 
338     /**
339      * Resolves a <em>whitelist</em> separated list of <i>CURIE</i> or <i>URI</i>.
340      *
341      * @param n current node.
342      * @param curieOrIRIList list of CURIE/URI.
343      * @param termAllowed determine whether the term should be whitelisted.
344      * @return list of resolved URIs.
345      * @throws URISyntaxException if there is an error processing CURIE or URL
346      */
347     protected IRI[] resolveCIRIeOrIRIList(Node n, String curieOrIRIList, boolean termAllowed)
348     throws URISyntaxException {
349         if(curieOrIRIList == null || curieOrIRIList.trim().length() == 0)
350             return new IRI[0];
351 
352         final String[] curieOrIRIListParts = curieOrIRIList.split("\\s");
353         final List<IRI> result = new ArrayList<>();
354         Resource curieOrIRI;
355         for(String curieORIRIListPart : curieOrIRIListParts) {
356             curieOrIRI = resolveCURIEOrIRI(curieORIRIListPart, termAllowed);
357             if(curieOrIRI != null && curieOrIRI instanceof IRI) {
358                 result.add((IRI) curieOrIRI);
359             } else {
360                 reportError(n, String.format("Invalid CURIE '%s' : expected IRI, found BNode.", curieORIRIListPart));
361             }
362         }
363         return result.toArray(new IRI[result.size()]);
364     }
365 
366     /**
367      * Resolves a IRI string as IRI.
368      *
369      * @param iriStr (partial) IRI string to be resolved.
370      * @return the resolved IRI.
371      */
372     protected IRI resolveIRI(String iriStr) {
373         return
374                 isAbsoluteIRI(iriStr)
375                         ?
376                 RDFUtils.iri(iriStr)
377                         :
378                 RDFUtils.iri( this.documentBase.toExternalForm(), iriStr );
379     }
380 
381     /**
382      * Resolves a <i>CURIE</i> or <i>IRI</i> string.
383      *
384      * @param curieOrIRI individual of CURIE/URI to resolve
385      * @param termAllowed if <code>true</code> the resolution can be a term.
386      * @return the resolved resource.
387      */
388     protected Resource resolveCURIEOrIRI(String curieOrIRI, boolean termAllowed) {
389         if( isCURIE(curieOrIRI) ) {
390             return resolveNamespacedIRI(curieOrIRI.substring(1, curieOrIRI.length() - 1), ResolutionPolicy.NSRequired);
391         }
392         if(isAbsoluteIRI(curieOrIRI))
393             return resolveIRI(curieOrIRI);
394         return resolveNamespacedIRI(
395                 curieOrIRI,
396                 termAllowed ? ResolutionPolicy.TermAllowed : ResolutionPolicy.NSNotRequired
397         );
398     }
399 
400     /**
401      * Pushes a context whiting the evaluation context stack, associated to tha given generation node.
402      *
403      * @param current
404      * @param ec
405      */
406     private void pushContext(Node current, EvaluationContext ec) {
407         ec.node = current;
408         evaluationContextStack.push(ec);
409     }
410 
411     /**
412      * @return the peek evaluation context.
413      */
414     private EvaluationContext getContext() {
415         return evaluationContextStack.peek();
416     }
417 
418     /**
419      * Pops out the peek evaluation context if ancestor of current node.
420      *
421      * @param current current node.
422      */
423     private void popContext(Node current) {
424         final Node peekNode = evaluationContextStack.peek().node;
425         if(DomUtils.isAncestorOf(peekNode, current)) {
426             evaluationContextStack.pop();
427         }
428     }
429 
430     /**
431      * Pushes a new vocabulary definition.
432      *
433      * @param currentNode node proving the vocabulary.
434      * @param vocab the vocabulary IRI.
435      */
436     private void pushVocabulary(Node currentNode, IRI vocab) {
437         vocabularyStack.push( new Vocabulary(currentNode, vocab) );
438     }
439 
440     /**
441      * @return the current peek vocabulary.
442      */
443     private IRI getVocabulary() {
444         if(vocabularyStack.isEmpty())
445             return null;
446         return vocabularyStack.peek().prefix;
447     }
448 
449     /**
450      * Pops out the vocabulary definition.
451      *
452      * @param current
453      */
454     private void popVocabulary(Node current) {
455         if(vocabularyStack.isEmpty())
456             return;
457         if(DomUtils.isAncestorOf(current, vocabularyStack.peek().originatingNode)) {
458             vocabularyStack.pop();
459         }
460     }
461 
462     /**
463      * Purge all incomplete triples originated from a node that is descendant of <code>current</code>.
464      *
465      * @param current
466      */
467     private void purgeIncompleteTriples(Node current) {
468         final List<IncompleteTriple> toBePurged = new ArrayList<>();
469         for(IncompleteTriple incompleteTriple : listOfIncompleteTriples) {
470             if( DomUtils.isAncestorOf(current, incompleteTriple.originatingNode, true) ) {
471                 toBePurged.add(incompleteTriple);
472             }
473         }
474         listOfIncompleteTriples.removeAll(toBePurged);
475         toBePurged.clear();
476     }
477 
478     /**
479      * Reports an error to the error reporter.
480      *
481      * @param n originating node.
482      * @param msg human readable message.
483      */
484     private void reportError(Node n, String msg) {
485         final String errorMsg = String.format(
486                 "Error while processing node [%s] : '%s'",
487                 DomUtils.getXPathForNode(n), msg
488         );
489         final int[] errorLocation = DomUtils.getNodeLocation(n);
490         this.issueReport.notifyIssue(
491                 IssueReport.IssueLevel.WARNING,
492                 errorMsg,
493                 errorLocation == null ? -1 : errorLocation[0],
494                 errorLocation == null ? -1 : errorLocation[1]
495         );
496     }
497 
498     /**
499      * Performs a <i>deep-first</i> tree visit on the given root node.
500      *
501      * @param node root node.
502      * @param extractionResult
503      */
504     private void depthFirstNode(Node node, ExtractionResult extractionResult) {
505         try {
506             processNode(node, extractionResult);
507         } catch (Exception e) {
508             if(logger.isDebugEnabled())
509                 logger.debug("Error while processing node.", e);
510             reportError(node, e.getMessage());
511         }
512         depthFirstChildren(node.getChildNodes(), extractionResult);
513         purgeIncompleteTriples(node);
514     }
515 
516     /**
517      * Performs a <i>deep-first</i> children list visit.
518      *
519      * @param nodeList
520      * @param extractionResult
521      */
522     private void depthFirstChildren(NodeList nodeList, ExtractionResult extractionResult) {
523         for(int i = 0; i < nodeList.getLength(); i++) {
524             final Node child = nodeList.item(i);
525             depthFirstNode(child, extractionResult);
526             popMappings(child);
527             popVocabulary(child);
528             popContext(child);
529         }
530     }
531 
532     /**
533      * Writes a triple on the extraction result.
534      *
535      * @param s
536      * @param p
537      * @param o
538      * @param extractionResult
539      */
540     private void writeTriple(Resource s, IRI p, Value o, ExtractionResult extractionResult) {
541         assert s != null : "subject   is null.";
542         assert p != null : "predicate is null.";
543         assert o != null : "object    is null.";
544         extractionResult.writeTriple(s, p, o);
545     }
546 
547     /**
548      * Processes the current node on the extraction algorithm.
549      * All the steps of this algorithm are annotated with the
550      * specification and section which describes it. The annotation is at form
551      * <em>RDFa&lt;spec-version%gt;[&lt;section&gt;]</em>
552      *
553      * @param currentElement
554      * @param extractionResult
555      * @throws Exception
556      */
557     // TODO: add references to the RDFa 1.1 algorithm.
558     private void processNode(Node currentElement, ExtractionResult extractionResult) throws Exception {
559         final EvaluationContext currentEvaluationContext = getContext();
560         try {
561             if(
562                 currentElement.getNodeType() != Node.DOCUMENT_NODE
563                 &&
564                 currentElement.getNodeType() != Node.ELEMENT_NODE
565             ) return;
566 
567             // RDFa1.1[7.5.3]
568             updateVocabulary(currentElement);
569 
570             // RDFa1.0[5.5.2] / RDFa1.1[7.5.4]
571             //Node currentElement = node;
572             updateIRIMapping(currentElement);
573 
574             // RDFa1.0[5.5.3] / RDFa1.1[7.5.5]
575             updateLanguage(currentElement, currentEvaluationContext);
576 
577             if(! isRelativeNode(currentElement)) {
578                 // RDFa1.0[5.5.4] / RDFa1.1[7.5.6]
579                 establishNewSubject(currentElement, currentEvaluationContext);
580             } else {
581                 // RDFa1.0[5.5.5] / RDFa1.1[7.5.7]
582                 establishNewSubjectCurrentObjectResource(
583                         currentElement,
584                         currentEvaluationContext
585                 );
586             }
587 
588             /*
589             if(currentEvaluationContext.newSubject == null) {
590                 currentEvaluationContext.newSubject = resolveIRI(documentBase.toExternalForm());
591             }
592             assert currentEvaluationContext.newSubject != null : "newSubject must be not null.";
593             */
594             if(currentEvaluationContext.newSubject == null)
595                 return;
596             if(logger.isDebugEnabled())
597                 logger.debug("newSubject: " + currentEvaluationContext.newSubject);
598 
599             // RDFa1.0[5.5.6] / RDFa1.1[7.5.8]
600             final IRI[] types = getTypes(currentElement);
601             for(IRI type : types) {
602                 writeTriple(currentEvaluationContext.newSubject, RDF.TYPE, type, extractionResult);
603             }
604 
605             // RDFa1.0[5.5.7] / RDFa1.1[7.5.9]
606             final IRI[] rels = getRels(currentElement);
607             final IRI[] revs = getRevs(currentElement);
608             if(currentEvaluationContext.currentObjectResource != null) {
609                 for (IRI rel : rels) {
610                     writeTriple(
611                             currentEvaluationContext.newSubject,
612                             rel,
613                             currentEvaluationContext.currentObjectResource,
614                             extractionResult
615                     );
616                 }
617                 for (IRI rev : revs) {
618                     writeTriple(
619                             currentEvaluationContext.currentObjectResource,
620                             rev,
621                             currentEvaluationContext.newSubject, extractionResult
622                     );
623                 }
624             } else { // RDFa1.0[5.5.8] / RDFa1.1[7.5.10]
625                 for(IRI rel : rels) {
626                     listOfIncompleteTriples.add(
627                             new IncompleteTriple(
628                                     currentElement,
629                                     currentEvaluationContext.newSubject,
630                                     rel,
631                                     IncompleteTripleDirection.Forward
632                             )
633                     );
634                 }
635                 for(IRI rev : revs) {
636                     listOfIncompleteTriples.add(
637                             new IncompleteTriple(
638                                     currentElement,
639                                     currentEvaluationContext.newSubject,
640                                     rev,
641                                     IncompleteTripleDirection.Reverse
642                             )
643                     );
644                 }
645             }
646 
647             // RDFa1.0[5.5.9] / RDFa1.1[7.5.11]
648             final Value currentObject = getCurrentObject(currentElement);
649             final IRI[] predicates = getPredicate(currentElement);
650             if (currentObject != null && predicates != null) {
651                 for (IRI predicate : predicates) {
652                     writeTriple(currentEvaluationContext.newSubject, predicate, currentObject, extractionResult);
653                 }
654             }
655 
656             // RDFa1.0[5.5.10] / RDFa1.1[7.5.12]
657             if(!currentEvaluationContext.skipElem && currentEvaluationContext.newSubject != null) {
658                 for (IncompleteTriple incompleteTriple : listOfIncompleteTriples) {
659                     incompleteTriple.produceTriple(
660                             currentElement,
661                             currentEvaluationContext.newSubject,
662                             extractionResult
663                     );
664                 }
665             }
666         } catch (Exception e) {
667             throw e;
668         } finally {
669             // RDFa1.0[5.5.11] / RDFa1.1[7.5.13]
670             if(currentEvaluationContext.recourse) {
671                 EvaluationContext newEvaluationContext = new EvaluationContext(currentEvaluationContext.base);
672                 if(currentEvaluationContext.skipElem) {
673                     newEvaluationContext.language = currentEvaluationContext.language;
674                 } else {
675                     newEvaluationContext.base = currentEvaluationContext.base;
676 
677                     if(currentEvaluationContext.newSubject != null) {
678                         newEvaluationContext.parentSubject = currentEvaluationContext.newSubject;
679                     } else {
680                         newEvaluationContext.parentSubject = currentEvaluationContext.parentSubject;
681                     }
682 
683                     if(currentEvaluationContext.currentObjectResource != null) {
684                         newEvaluationContext.parentObject = currentEvaluationContext.currentObjectResource;
685                     } else if(currentEvaluationContext.newSubject != null) {
686                         newEvaluationContext.parentObject = currentEvaluationContext.newSubject;
687                     } else {
688                         newEvaluationContext.parentObject = currentEvaluationContext.parentSubject;
689                     }
690 
691                     newEvaluationContext.language = currentEvaluationContext.language;
692                 }
693                 pushContext(currentElement, newEvaluationContext);
694             }
695         }
696     }
697 
698     /**
699      * Extract IRI namespaces (prefixes) from the current node.
700      *
701      * @param node
702      * @param prefixMapList
703      */
704     private void extractPrefixes(Node node, List<PrefixMap> prefixMapList) {
705         final String prefixAttribute = DomUtils.readAttribute(node, PREFIX_ATTRIBUTE, null);
706         if(prefixAttribute == null)
707             return;
708         final String[] prefixParts = extractPrefixSections(prefixAttribute);
709         for(String prefixPart : prefixParts) {
710             int splitPoint = prefixPart.indexOf(IRI_PREFIX_SEPARATOR);
711             final String prefix = prefixPart.substring(0, splitPoint);
712             if(prefix.length() == 0) {
713                 reportError(node, String.format("Invalid prefix length in prefix attribute '%s'", prefixAttribute));
714                 continue;
715             }
716             final IRI iri;
717             final String iriStr = prefixPart.substring(splitPoint + 1);
718             try {
719                 iri = resolveIRI(iriStr);
720             } catch (Exception e) {
721                 reportError(
722                         node,
723                         String.format(
724                                 "Resolution of prefix '%s' defines an invalid IRI: '%s'",
725                                 prefixAttribute, iriStr
726                         )
727                 );
728                 continue;
729             }
730             prefixMapList.add( new PrefixMap(prefix, iri) );
731         }
732     }
733 
734     /**
735      * Updates the current language.
736      *
737      * @param node
738      * @param currentEvaluationContext
739      */
740     private void updateLanguage(Node node, EvaluationContext currentEvaluationContext) {
741         final String candidateLanguage = DomUtils.readAttribute(node, XML_LANG_ATTRIBUTE, null);
742         if(candidateLanguage != null)
743             currentEvaluationContext.language = candidateLanguage;
744     }
745 
746     /**
747      * Establish the new subject for the current recursion.
748      * See <i>RDFa 1.0 Specification section 5.5.4</i>, <i>RDFa 1.1 Specification section 7.5.6</i>.
749      *
750      * @param node
751      * @param currentEvaluationContext
752      * @throws URISyntaxException
753      */
754     private void establishNewSubject(Node node, EvaluationContext currentEvaluationContext)
755     throws URISyntaxException {
756         String candidateIRIOrCURIE;
757         for(String subjectAttribute : SUBJECT_ATTRIBUTES) {
758             candidateIRIOrCURIE = DomUtils.readAttribute(node, subjectAttribute, null);
759             if(candidateIRIOrCURIE != null) {
760                 currentEvaluationContext.newSubject = resolveCURIEOrIRI(candidateIRIOrCURIE, false);
761                 return;
762             }
763         }
764 
765         if(node.getNodeName().equalsIgnoreCase(HEAD_TAG) || node.getNodeName().equalsIgnoreCase(BODY_TAG)) {
766             currentEvaluationContext.newSubject = resolveIRI(currentEvaluationContext.base.toString());
767             return;
768         }
769 
770         if(DomUtils.hasAttribute(node, TYPEOF_ATTRIBUTE)) {
771             currentEvaluationContext.newSubject = RDFUtils.bnode();
772             return;
773         }
774 
775         if(DomUtils.hasAttribute(node, PROPERTY_ATTRIBUTE)) {
776             currentEvaluationContext.skipElem = true;
777         }
778         if(currentEvaluationContext.parentObject != null) {
779             currentEvaluationContext.newSubject = (Resource) currentEvaluationContext.parentObject;
780             return;
781         }
782 
783         currentEvaluationContext.newSubject = null;
784     }
785 
786     /**
787      * Establishes the new subject and the current object resource.
788      *
789      * See <i>RDFa 1.0 Specification section 5.5.5</i>, <i>RDFa 1.1 Specification section 7.5.7</i>.
790      *
791      * @param node
792      * @param currentEvaluationContext
793      * @throws URISyntaxException
794      */
795     private void establishNewSubjectCurrentObjectResource(Node node, EvaluationContext currentEvaluationContext)
796     throws URISyntaxException {
797         // Subject.
798         String candidateIRIOrCURIE;
799         candidateIRIOrCURIE = DomUtils.readAttribute(node, ABOUT_ATTRIBUTE, null);
800         if(candidateIRIOrCURIE != null) {
801             currentEvaluationContext.newSubject = resolveCURIEOrIRI(candidateIRIOrCURIE, false);
802         } else {
803             candidateIRIOrCURIE = DomUtils.readAttribute(node, SRC_ATTRIBUTE, null);
804             if (candidateIRIOrCURIE != null) {
805                 currentEvaluationContext.newSubject = resolveIRI(candidateIRIOrCURIE);
806             } else {
807                 if (node.getNodeName().equalsIgnoreCase(HEAD_TAG) || node.getNodeName().equalsIgnoreCase(BODY_TAG)) {
808                     currentEvaluationContext.newSubject = resolveIRI(currentEvaluationContext.base.toString());
809                 } else {
810                     if (DomUtils.hasAttribute(node, TYPEOF_ATTRIBUTE)) {
811                         currentEvaluationContext.newSubject = RDFUtils.bnode();
812                     } else {
813                         if (currentEvaluationContext.parentObject != null) {
814                             currentEvaluationContext.newSubject = (Resource) currentEvaluationContext.parentObject;
815                         }
816                     }
817                 }
818             }
819         }
820 
821         // Object.
822         candidateIRIOrCURIE = DomUtils.readAttribute(node, RESOURCE_ATTRIBUTE, null);
823         if(candidateIRIOrCURIE != null) {
824             currentEvaluationContext.currentObjectResource = resolveCURIEOrIRI(candidateIRIOrCURIE, false);
825             return;
826         }
827 
828         candidateIRIOrCURIE = DomUtils.readAttribute(node, HREF_ATTRIBUTE, null);
829         if(candidateIRIOrCURIE != null) {
830             currentEvaluationContext.currentObjectResource = resolveIRI(candidateIRIOrCURIE);
831             return;
832         }
833         currentEvaluationContext.currentObjectResource = null;
834     }
835 
836     private IRI[] getTypes(Node node) throws URISyntaxException {
837         final String typeOf = DomUtils.readAttribute(node, TYPEOF_ATTRIBUTE, null);
838         return resolveCIRIeOrIRIList(node, typeOf, true);
839     }
840 
841     private IRI[] getRels(Node node) throws URISyntaxException {
842         final String rel = DomUtils.readAttribute(node, REL_ATTRIBUTE, null);
843         return resolveCIRIeOrIRIList(node, rel, true);
844     }
845 
846     private IRI[] getRevs(Node node) throws URISyntaxException {
847         final String rev = DomUtils.readAttribute(node, REV_ATTRIBUTE, null);
848         return resolveCIRIeOrIRIList(node, rev, true);
849     }
850 
851     private IRI[] getPredicate(Node node) throws URISyntaxException {
852         final String candidateIRI = DomUtils.readAttribute(node, PROPERTY_ATTRIBUTE, null);
853         if(candidateIRI == null)
854             return null;
855         return resolveCIRIeOrIRIList(node, candidateIRI, true);
856     }
857 
858     /**
859      * Establishes the new object value.
860      * See <i>RDFa 1.0 Specification section 5.5.9</i>, <i>RDFa 1.1 Specification section 7.5.11</i>.
861      *
862      * @param node
863      * @return
864      * @throws URISyntaxException
865      * @throws IOException
866      * @throws TransformerException
867      */
868     private Value getCurrentObject(Node node)
869     throws URISyntaxException, IOException, TransformerException {
870         final String candidateObject = DomUtils.readAttribute(node, HREF_ATTRIBUTE, null);
871         if(candidateObject != null) {
872             return resolveIRI(candidateObject);
873         } else {
874             return gerCurrentObjectLiteral(node);
875         }
876     }
877 
878     private Literal gerCurrentObjectLiteral(Node node)
879     throws URISyntaxException, IOException, TransformerException {
880         final EvaluationContext currentEvaluationContext = getContext();
881         Literal literal;
882 
883         literal = getAsTypedLiteral(node);
884         if(literal != null)
885             return literal;
886 
887         literal = getAsXMLLiteral(node);
888         if(literal != null) {
889             currentEvaluationContext.recourse = false;
890             return literal;
891         }
892 
893         literal = getAsPlainLiteral(node, currentEvaluationContext.language);
894         if(literal != null)
895             return literal;
896 
897         return null;
898     }
899 
900     private static String getNodeContent(Node node) {
901         final String candidateContent = DomUtils.readAttribute(node, CONTENT_ATTRIBUTE, null);
902         if(candidateContent != null)
903             return candidateContent;
904         return node.getTextContent();
905     }
906 
907     /**
908      * Extracts the current typed literal from the given node.
909      * See <i>RDFa 1.0 Specification section 5.5.9.1</i>.
910      *
911      * @param node
912      * @return
913      * @throws URISyntaxException
914      */
915     private Literal getAsTypedLiteral(Node node) throws URISyntaxException {
916         final String datatype = DomUtils.readAttribute(node, DATATYPE_ATTRIBUTE, null);
917         if (datatype == null || datatype.trim().length() == 0 || XML_LITERAL_DATATYPE.equals(datatype.trim()) ) {
918             return null;
919         }
920         final Resource curieOrIRI = resolveCURIEOrIRI(datatype, true);
921         return RDFUtils.literal(getNodeContent(node), curieOrIRI instanceof IRI ? (IRI) curieOrIRI : null);
922     }
923 
924     private void pushMappings(Node sourceNode, List<PrefixMap> prefixMapList) {
925 
926         final Map<String, IRI> mapping = new HashMap<>();
927         for (PrefixMap prefixMap : prefixMapList) {
928             mapping.put(prefixMap.prefix, prefixMap.IRI);
929         }
930         IRIMappingStack.push( new IRIMapping(sourceNode, mapping) );
931     }
932 
933     private void popMappings(Node node) {
934         if(IRIMappingStack.isEmpty())
935             return;
936         final IRIMapping peek = IRIMappingStack.peek();
937         if( ! DomUtils.isAncestorOf(peek.sourceNode, node) ) {
938             IRIMappingStack.pop();
939         }
940     }
941 
942     /**
943      * Resolve a namespaced IRI, if <code>safe</code> is <code>true</code>
944      * then the mapping must define a prefix, otherwise it is considered relative.
945      *
946      * @param mapping
947      * @param resolutionPolicy
948      * @return
949      */
950     private Resource resolveNamespacedIRI(String mapping, ResolutionPolicy resolutionPolicy) {
951         if(mapping.indexOf(IRI_PATH_SEPARATOR) == 0) { // Begins with '/'
952             mapping = mapping.substring(1);
953         }
954 
955         final int prefixSeparatorIndex = mapping.indexOf(':');
956         if(prefixSeparatorIndex == -1) { // there is no prefix separator.
957             if(resolutionPolicy == ResolutionPolicy.NSRequired) {
958                 throw new IllegalArgumentException(
959                         String.format("Invalid mapping string [%s], must declare a prefix.", mapping)
960                 );
961             }
962             if (resolutionPolicy == ResolutionPolicy.TermAllowed) {
963                 final IRI currentVocabulary = getVocabulary();
964                 // Mapping is a TERM.
965                 if (currentVocabulary != null) {
966                     return resolveIRI(currentVocabulary.toString() + mapping);
967                 }
968             }
969             return resolveIRI(documentBase.toString() + mapping);
970         }
971 
972         final String prefix = mapping.substring(0, prefixSeparatorIndex);
973         final IRI curieMapping = getMapping(prefix);
974         if(curieMapping == null) {
975             throw new IllegalArgumentException( String.format("Cannot map prefix '%s'", prefix) );
976         }
977         final String candidateCURIEStr = curieMapping.toString() + mapping.substring(prefixSeparatorIndex + 1);
978         final java.net.URI candidateCURIE;
979         try {
980             candidateCURIE = new java.net.URI(candidateCURIEStr);
981         } catch (URISyntaxException IRIse) {
982             throw new IllegalArgumentException(String.format("Invalid CURIE '%s'", candidateCURIEStr) );
983         }
984         return resolveIRI(
985                 candidateCURIE.isAbsolute()
986                         ?
987                         candidateCURIE.toString()
988                         :
989                         documentBase.toString() + candidateCURIE.toString()
990         );
991     }
992 
993     /**
994      * The resolution policy provided to the method {@link #resolveNamespacedIRI(String, ResolutionPolicy)}.
995      */
996     enum ResolutionPolicy {
997         NSNotRequired,
998         NSRequired,
999         TermAllowed
1000     }
1001 
1002     /**
1003      * Defines an evaluation context.
1004      */
1005     private class EvaluationContext {
1006         private Node node;
1007         private URL base;
1008         private Resource parentSubject;
1009         private Value parentObject;
1010         private String language;
1011         private boolean recourse;
1012         private boolean skipElem;
1013         private Resource newSubject;
1014         private Resource currentObjectResource;
1015 
1016         /**
1017          * Sections <em>RDFa1.0[5.5]</em>, <em>RDFa1.0[5.5.1]</em>, <em>RDFa1.1[7.5.1]</em> .
1018          *
1019          * @param base
1020          */
1021         EvaluationContext(URL base) {
1022             this.base             = base;
1023             this.parentSubject    = resolveIRI( base.toExternalForm() );
1024             this.parentObject     = null;
1025             this.language         = null;
1026             this.recourse         = true;
1027             this.skipElem         = false;
1028             this.newSubject       = null;
1029             this.currentObjectResource = null;
1030         }
1031     }
1032 
1033     /**
1034      * Defines a prefix mapping.
1035      */
1036     private class PrefixMap {
1037         final String prefix;
1038         final IRI    IRI;
1039         public PrefixMap(String prefix, IRI IRI) {
1040             this.prefix = prefix;
1041             this.IRI = IRI;
1042         }
1043     }
1044 
1045     /**
1046      * Defines a IRI mapping.
1047      */
1048     private class IRIMapping {
1049         final Node sourceNode;
1050         final Map<String, IRI> map;
1051 
1052         public IRIMapping(Node sourceNode, Map<String, IRI> map) {
1053             this.sourceNode = sourceNode;
1054             this.map        = map;
1055         }
1056     }
1057 
1058     /**
1059      * Defines the direction of an {@link IncompleteTriple}.
1060      */
1061     private enum IncompleteTripleDirection {
1062         Forward,
1063         Reverse
1064     }
1065 
1066     /**
1067      * Defines an incomplete triple.
1068      */
1069     private class IncompleteTriple {
1070         final Node     originatingNode;
1071         final Resource subject;
1072         final IRI      predicate;
1073         final IncompleteTripleDirection direction;
1074 
1075         public IncompleteTriple(
1076                 Node originatingNode,
1077                 Resource subject,
1078                 IRI predicate,
1079                 IncompleteTripleDirection direction
1080         ) {
1081             if(originatingNode == null || subject == null || predicate == null || direction == null)
1082                 throw new IllegalArgumentException();
1083 
1084             this.originatingNode = originatingNode;
1085             this.subject         = subject;
1086             this.predicate       = predicate;
1087             this.direction       = direction;
1088         }
1089 
1090         public boolean produceTriple(Node resourceNode, Resource r, ExtractionResult extractionResult) {
1091             if( ! DomUtils.isAncestorOf(originatingNode, resourceNode, true) )
1092                 return false;
1093 
1094             if(r == null)
1095                 throw new IllegalArgumentException();
1096             switch (direction) {
1097                 case Forward:
1098                     extractionResult.writeTriple(subject, predicate, r);
1099                     break;
1100                 case Reverse:
1101                     extractionResult.writeTriple(r, predicate, subject);
1102                     break;
1103                 default:
1104                     throw new IllegalStateException();
1105             }
1106             return true;
1107         }
1108     }
1109 
1110     /**
1111      * Defines a vocabulary object.
1112      */
1113     private class Vocabulary {
1114         final Node originatingNode;
1115         final IRI prefix;
1116 
1117         public Vocabulary(Node originatingNode, IRI prefix) {
1118             this.originatingNode = originatingNode;
1119             this.prefix = prefix;
1120         }
1121     }
1122 
1123 }