1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.rdfa;
19
20 import org.apache.any23.extractor.IssueReport;
21 import org.apache.any23.extractor.ExtractionResult;
22 import org.apache.any23.extractor.html.DomUtils;
23 import org.apache.any23.rdf.RDFUtils;
24 import org.openrdf.model.Literal;
25 import org.openrdf.model.Resource;
26 import org.openrdf.model.URI;
27 import org.openrdf.model.Value;
28 import org.openrdf.model.vocabulary.RDF;
29 import org.slf4j.Logger;
30 import org.slf4j.LoggerFactory;
31 import org.w3c.dom.Document;
32 import org.w3c.dom.NamedNodeMap;
33 import org.w3c.dom.Node;
34 import org.w3c.dom.NodeList;
35
36 import javax.xml.transform.TransformerException;
37 import java.io.IOException;
38 import java.net.MalformedURLException;
39 import java.net.URISyntaxException;
40 import java.net.URL;
41 import java.util.ArrayList;
42 import java.util.HashMap;
43 import java.util.List;
44 import java.util.Map;
45 import java.util.Stack;
46
47
48
49
50
51
52
53 public class RDFa11Parser {
54
55 private static final Logger logger = LoggerFactory.getLogger(RDFa11Parser.class);
56
57 public static final String CURIE_SEPARATOR = ":";
58 public static final char URI_PREFIX_SEPARATOR = ':';
59 public static final String URI_SCHEMA_SEPARATOR = "://";
60 public static final String URI_PATH_SEPARATOR = "/";
61
62 public static final String HEAD_TAG = "HEAD";
63 public static final String BODY_TAG = "BODY";
64
65 public static final String XMLNS_ATTRIBUTE = "xmlns";
66 public static final String XML_LANG_ATTRIBUTE = "xml:lang";
67
68 public static final String REL_ATTRIBUTE = "rel";
69 public static final String REV_ATTRIBUTE = "rev";
70
71 public static final String ABOUT_ATTRIBUTE = "about";
72 public static final String RESOURCE_ATTRIBUTE = "resource";
73 public static final String SRC_ATTRIBUTE = "src";
74 public static final String HREF_ATTRIBUTE = "href";
75
76 public static final String TYPE_ATTRIBUTE = "type";
77 public static final String ATTRIBUTE_CSS = "text/css";
78
79 public static final String[] SUBJECT_ATTRIBUTES = {
80 ABOUT_ATTRIBUTE,
81 SRC_ATTRIBUTE,
82 RESOURCE_ATTRIBUTE,
83 HREF_ATTRIBUTE
84 };
85
86 public static final String PREFIX_ATTRIBUTE = "prefix";
87 public static final String TYPEOF_ATTRIBUTE = "typeof";
88 public static final String PROPERTY_ATTRIBUTE = "property";
89 public static final String DATATYPE_ATTRIBUTE = "datatype";
90 public static final String CONTENT_ATTRIBUTE = "content";
91 public static final String VOCAB_ATTRIBUTE = "vocab";
92
93 public static final String PROFILE_ATTRIBUTE = "profile";
94
95 public static final String XML_LITERAL_DATATYPE = "rdf:XMLLiteral";
96
97 public static final String XMLNS_DEFAULT = "http://www.w3.org/1999/xhtml";
98
99 private IssueReport issueReport;
100
101 private URL documentBase;
102
103 private final Stack<URIMapping> uriMappingStack = new Stack<URIMapping>();
104
105 private final Stack<Vocabulary> vocabularyStack = new Stack<Vocabulary>();
106
107 private final List<IncompleteTriple> listOfIncompleteTriples = new ArrayList<IncompleteTriple>();
108
109 private final Stack<EvaluationContext> evaluationContextStack = new Stack<EvaluationContext>();
110
111 protected static URL getDocumentBase(URL documentURL, Document document) throws MalformedURLException {
112 String base;
113 base = DomUtils.find(document, "/HTML/HEAD/BASE/@href");
114 if( ! "".equals(base) ) return new URL(base);
115 base = DomUtils.find(document, "//*/h:head/h:base[position()=1]/@href"); // XHTML documents.
116 if( ! "".equals(base) ) return new URL(base);
117 return documentURL;
118 }
119
120
121
122
123
124
125
126
127 protected static String[] extractPrefixSections(String prefixesDeclaration) {
128 final String[] parts = prefixesDeclaration.split("\\s");
129 final List<String> out = new ArrayList<String>();
130 int i = 0;
131 while(i < parts.length) {
132 final String part = parts[i];
133 if(part.length() == 0) {
134 i++;
135 continue;
136 }
137 if(part.charAt( part.length() -1 ) == URI_PREFIX_SEPARATOR) {
138 i++;
139 while(i < parts.length && parts[i].length() == 0) i++;
140 out.add( part + (i < parts.length ? parts[i] : "") );
141 i++;
142 } else {
143 out.add(parts[i]);
144 i++;
145 }
146 }
147 return out.toArray( new String[out.size()] );
148 }
149
150 protected static boolean isAbsoluteURI(String uri) {
151 return uri.contains(URI_SCHEMA_SEPARATOR);
152 }
153
154 protected static boolean isCURIE(String curie) {
155 if(curie == null) {
156 throw new NullPointerException("curie string cannot be null.");
157 }
158 if(curie.trim().length() == 0) return false;
159
160
161 if( curie.charAt(0) != '[' || curie.charAt(curie.length() -1) != ']') return false;
162 int separatorIndex = curie.indexOf(CURIE_SEPARATOR);
163 return separatorIndex > 0 && curie.indexOf(CURIE_SEPARATOR, separatorIndex + 1) == -1;
164 }
165
166 protected static boolean isCURIEBNode(String curie) {
167 return isCURIE(curie) && curie.substring(1, curie.length() -1).split(CURIE_SEPARATOR)[0].equals("_");
168 }
169
170 protected static boolean isRelativeNode(Node node) {
171 if( ATTRIBUTE_CSS.equals( DomUtils.readAttribute(node, TYPE_ATTRIBUTE) ) ) return false;
172 return DomUtils.hasAttribute(node, REL_ATTRIBUTE) || DomUtils.hasAttribute(node, REV_ATTRIBUTE);
173 }
174
175
176 protected static Literal getAsPlainLiteral(Node node, String currentLanguage) {
177 final String content = DomUtils.readAttribute(node, CONTENT_ATTRIBUTE, null);
178 if(content != null) return RDFUtils.literal(content, currentLanguage);
179
180 if(! node.hasChildNodes() ) return RDFUtils.literal("", currentLanguage);
181
182 final String nodeTextContent = node.getTextContent();
183 return nodeTextContent == null ? null : RDFUtils.literal(nodeTextContent.trim(), currentLanguage);
184 }
185
186 protected static Literal getAsXMLLiteral(Node node) throws IOException, TransformerException {
187 final String datatype = DomUtils.readAttribute(node, DATATYPE_ATTRIBUTE, null);
188 if(! XML_LITERAL_DATATYPE.equals(datatype)) return null;
189
190 final String xmlSerializedNode = DomUtils.serializeToXML(node, false);
191 return RDFUtils.literal(xmlSerializedNode, RDF.XMLLITERAL);
192 }
193
194 protected static boolean isXMLNSDeclared(Document document) {
195 final String attributeValue = document.getDocumentElement().getAttribute(XMLNS_ATTRIBUTE);
196 if(attributeValue.length() == 0) return false;
197 return XMLNS_DEFAULT.equals(attributeValue);
198 }
199
200 public RDFa11Parser() {}
201
202
203
204
205
206
207
208
209 public void processDocument(URL documentURL, Document document, ExtractionResult extractionResult)
210 throws RDFa11ParserException {
211 try {
212 this.issueReport = extractionResult;
213
214
215 if( ! isXMLNSDeclared(document)) {
216 reportError(
217 document.getDocumentElement(),
218 String.format(
219 "The default %s namespace is expected to be declared and equal to '%s' .",
220 XMLNS_ATTRIBUTE, XMLNS_DEFAULT
221 )
222 );
223 }
224
225 try {
226 documentBase = getDocumentBase(documentURL, document);
227 } catch (MalformedURLException murle) {
228 throw new RDFa11ParserException("Invalid document base URL.", murle);
229 }
230
231
232 pushContext(document, new EvaluationContext(documentBase));
233
234 depthFirstNode(document, extractionResult);
235
236 assert listOfIncompleteTriples.isEmpty()
237 :
238 "The list of incomplete triples is expected to be empty at the end of processing.";
239 } finally {
240 reset();
241 }
242 }
243
244
245
246
247 public void reset() {
248 issueReport = null;
249 documentBase = null;
250 uriMappingStack.clear();
251 listOfIncompleteTriples.clear();
252 evaluationContextStack.clear();
253 }
254
255
256
257
258
259
260 protected void updateVocabulary(Node currentNode) {
261 final String vocabularyStr = DomUtils.readAttribute(currentNode, VOCAB_ATTRIBUTE, null);
262 if(vocabularyStr == null) return;
263 try {
264 pushVocabulary(currentNode, RDFUtils.uri(vocabularyStr));
265 } catch (Exception e) {
266 reportError(currentNode, String.format("Invalid vocabulary [%s], must be a URI.", vocabularyStr));
267 }
268 }
269
270
271
272
273
274
275 protected void updateURIMapping(Node node) {
276 final NamedNodeMap attributes = node.getAttributes();
277 if (null == attributes) return;
278
279 Node attribute;
280 final List<PrefixMap> prefixMapList = new ArrayList<PrefixMap>();
281 final String namespacePrefix = XMLNS_ATTRIBUTE + URI_PREFIX_SEPARATOR;
282 for (int a = 0; a < attributes.getLength(); a++) {
283 attribute = attributes.item(a);
284 if (attribute.getNodeName().startsWith(namespacePrefix)) {
285 prefixMapList.add(
286 new PrefixMap(
287 attribute.getNodeName().substring(namespacePrefix.length()),
288 resolveURI(attribute.getNodeValue())
289 )
290 );
291 }
292 }
293
294 extractPrefixes(node, prefixMapList);
295
296 if(prefixMapList.size() == 0) return;
297 pushMappings(
298 node,
299 prefixMapList
300 );
301 }
302
303
304
305
306
307
308
309 protected URI getMapping(String prefix) {
310 for (URIMapping uriMapping : uriMappingStack) {
311 final URI mapping = uriMapping.map.get(prefix);
312 if (mapping != null) {
313 return mapping;
314 }
315 }
316 return null;
317 }
318
319
320
321
322
323
324
325
326
327 protected URI[] resolveCurieOrURIList(Node n, String curieOrURIList, boolean termAllowed)
328 throws URISyntaxException {
329 if(curieOrURIList == null || curieOrURIList.trim().length() == 0) return new URI[0];
330
331 final String[] curieOrURIListParts = curieOrURIList.split("\\s");
332 final List<URI> result = new ArrayList<URI>();
333 Resource curieOrURI;
334 for(String curieORURIListPart : curieOrURIListParts) {
335 curieOrURI = resolveCURIEOrURI(curieORURIListPart, termAllowed);
336 if(curieOrURI != null && curieOrURI instanceof URI) {
337 result.add((URI) curieOrURI);
338 } else {
339 reportError(n, String.format("Invalid CURIE '%s' : expected URI, found BNode.", curieORURIListPart));
340 }
341 }
342 return result.toArray(new URI[result.size()]);
343 }
344
345
346
347
348
349
350
351 protected URI resolveURI(String uriStr) {
352 return
353 isAbsoluteURI(uriStr)
354 ?
355 RDFUtils.uri(uriStr)
356 :
357 RDFUtils.uri( this.documentBase.toExternalForm(), uriStr );
358 }
359
360
361
362
363
364
365
366
367 protected Resource resolveCURIEOrURI(String curieOrURI, boolean termAllowed) {
368 if( isCURIE(curieOrURI) ) {
369 return resolveNamespacedURI(curieOrURI.substring(1, curieOrURI.length() - 1), ResolutionPolicy.NSRequired);
370 }
371 if(isAbsoluteURI(curieOrURI)) return resolveURI(curieOrURI);
372 return resolveNamespacedURI(
373 curieOrURI,
374 termAllowed ? ResolutionPolicy.TermAllowed : ResolutionPolicy.NSNotRequired
375 );
376 }
377
378
379
380
381
382
383
384 private void pushContext(Node current, EvaluationContext ec) {
385 ec.node = current;
386 evaluationContextStack.push(ec);
387 }
388
389
390
391
392 private EvaluationContext getContext() {
393 return evaluationContextStack.peek();
394 }
395
396
397
398
399
400
401 private void popContext(Node current) {
402 final Node peekNode = evaluationContextStack.peek().node;
403 if(DomUtils.isAncestorOf(peekNode, current)) {
404 evaluationContextStack.pop();
405 }
406 }
407
408
409
410
411
412
413
414 private void pushVocabulary(Node currentNode, URI vocab) {
415 vocabularyStack.push( new Vocabulary(currentNode, vocab) );
416 }
417
418
419
420
421 private URI getVocabulary() {
422 if(vocabularyStack.isEmpty()) return null;
423 return vocabularyStack.peek().prefix;
424 }
425
426
427
428
429
430
431 private void popVocabulary(Node current) {
432 if(vocabularyStack.isEmpty()) return;
433 if(DomUtils.isAncestorOf(current, vocabularyStack.peek().originatingNode)) {
434 vocabularyStack.pop();
435 }
436 }
437
438
439
440
441
442
443 private void purgeIncompleteTriples(Node current) {
444 final List<IncompleteTriple> toBePurged = new ArrayList<IncompleteTriple>();
445 for(IncompleteTriple incompleteTriple : listOfIncompleteTriples) {
446 if( DomUtils.isAncestorOf(current, incompleteTriple.originatingNode, true) ) {
447 toBePurged.add(incompleteTriple);
448 }
449 }
450 listOfIncompleteTriples.removeAll(toBePurged);
451 toBePurged.clear();
452 }
453
454
455
456
457
458
459
460 private void reportError(Node n, String msg) {
461 final String errorMsg = String.format(
462 "Error while processing node [%s] : '%s'",
463 DomUtils.getXPathForNode(n), msg
464 );
465 final int[] errorLocation = DomUtils.getNodeLocation(n);
466 this.issueReport.notifyIssue(
467 IssueReport.IssueLevel.Warning,
468 errorMsg,
469 errorLocation == null ? -1 : errorLocation[0],
470 errorLocation == null ? -1 : errorLocation[1]
471 );
472 }
473
474
475
476
477
478
479
480 private void depthFirstNode(Node node, ExtractionResult extractionResult) {
481 try {
482 processNode(node, extractionResult);
483 } catch (Exception e) {
484 if(logger.isDebugEnabled()) logger.debug("Error while processing node.", e);
485 reportError(node, e.getMessage());
486
487 }
488 depthFirstChildren(node.getChildNodes(), extractionResult);
489 purgeIncompleteTriples(node);
490 }
491
492
493
494
495
496
497
498 private void depthFirstChildren(NodeList nodeList, ExtractionResult extractionResult) {
499 for(int i = 0; i < nodeList.getLength(); i++) {
500 final Node child = nodeList.item(i);
501 depthFirstNode(child, extractionResult);
502 popMappings(child);
503 popVocabulary(child);
504 popContext(child);
505 }
506 }
507
508
509
510
511
512
513
514
515
516 private void writeTriple(Resource s, URI p, Value o, ExtractionResult extractionResult) {
517
518 assert s != null : "subject is null.";
519 assert p != null : "predicate is null.";
520 assert o != null : "object is null.";
521 extractionResult.writeTriple(s, p, o);
522 }
523
524
525
526
527
528
529
530
531
532
533
534
535 private void processNode(Node currentElement, ExtractionResult extractionResult) throws Exception {
536
537 final EvaluationContext currentEvaluationContext = getContext();
538 try {
539 if(
540 currentElement.getNodeType() != Node.DOCUMENT_NODE
541 &&
542 currentElement.getNodeType() != Node.ELEMENT_NODE
543 ) return;
544
545
546 updateVocabulary(currentElement);
547
548
549
550 updateURIMapping(currentElement);
551
552
553 updateLanguage(currentElement, currentEvaluationContext);
554
555 if(! isRelativeNode(currentElement)) {
556
557 establishNewSubject(currentElement, currentEvaluationContext);
558 } else {
559
560 establishNewSubjectCurrentObjectResource(
561 currentElement,
562 currentEvaluationContext
563 );
564 }
565
566
567
568
569
570
571
572 if(currentEvaluationContext.newSubject == null) return;
573 if(logger.isDebugEnabled()) logger.debug("newSubject: " + currentEvaluationContext.newSubject);
574
575
576 final URI[] types = getTypes(currentElement);
577 for(URI type : types) {
578 writeTriple(currentEvaluationContext.newSubject, RDF.TYPE, type, extractionResult);
579 }
580
581
582 final URI[] rels = getRels(currentElement);
583 final URI[] revs = getRevs(currentElement);
584 if(currentEvaluationContext.currentObjectResource != null) {
585 for (URI rel : rels) {
586 writeTriple(
587 currentEvaluationContext.newSubject,
588 rel,
589 currentEvaluationContext.currentObjectResource,
590 extractionResult
591 );
592 }
593 for (URI rev : revs) {
594 writeTriple(
595 currentEvaluationContext.currentObjectResource,
596 rev,
597 currentEvaluationContext.newSubject, extractionResult
598 );
599 }
600 } else {
601 for(URI rel : rels) {
602 listOfIncompleteTriples.add(
603 new IncompleteTriple(
604 currentElement,
605 currentEvaluationContext.newSubject,
606 rel,
607 IncompleteTripleDirection.Forward
608 )
609 );
610 }
611 for(URI rev : revs) {
612 listOfIncompleteTriples.add(
613 new IncompleteTriple(
614 currentElement,
615 currentEvaluationContext.newSubject,
616 rev,
617 IncompleteTripleDirection.Reverse
618 )
619 );
620 }
621 }
622
623
624 final Value currentObject = getCurrentObject(currentElement);
625 final URI[] predicates = getPredicate(currentElement);
626 if (currentObject != null && predicates != null) {
627 for (URI predicate : predicates) {
628 writeTriple(currentEvaluationContext.newSubject, predicate, currentObject, extractionResult);
629 }
630 }
631
632
633 if(!currentEvaluationContext.skipElem && currentEvaluationContext.newSubject != null) {
634 for (IncompleteTriple incompleteTriple : listOfIncompleteTriples) {
635 incompleteTriple.produceTriple(
636 currentElement,
637 currentEvaluationContext.newSubject,
638 extractionResult
639 );
640 }
641 }
642 } catch (Exception e) {
643 throw e;
644 } finally {
645
646 if(currentEvaluationContext.recourse) {
647 EvaluationContext newEvaluationContext = new EvaluationContext(currentEvaluationContext.base);
648 if(currentEvaluationContext.skipElem) {
649 newEvaluationContext.language = currentEvaluationContext.language;
650 } else {
651 newEvaluationContext.base = currentEvaluationContext.base;
652
653 if(currentEvaluationContext.newSubject != null) {
654 newEvaluationContext.parentSubject = currentEvaluationContext.newSubject;
655 } else {
656 newEvaluationContext.parentSubject = currentEvaluationContext.parentSubject;
657 }
658
659 if(currentEvaluationContext.currentObjectResource != null) {
660 newEvaluationContext.parentObject = currentEvaluationContext.currentObjectResource;
661 } else if(currentEvaluationContext.newSubject != null) {
662 newEvaluationContext.parentObject = currentEvaluationContext.newSubject;
663 } else {
664 newEvaluationContext.parentObject = currentEvaluationContext.parentSubject;
665 }
666
667 newEvaluationContext.language = currentEvaluationContext.language;
668 }
669 pushContext(currentElement, newEvaluationContext);
670 }
671 }
672 }
673
674
675
676
677
678
679
680 private void extractPrefixes(Node node, List<PrefixMap> prefixMapList) {
681 final String prefixAttribute = DomUtils.readAttribute(node, PREFIX_ATTRIBUTE, null);
682 if(prefixAttribute == null) return;
683 final String[] prefixParts = extractPrefixSections(prefixAttribute);
684 for(String prefixPart : prefixParts) {
685 int splitPoint = prefixPart.indexOf(URI_PREFIX_SEPARATOR);
686 final String prefix = prefixPart.substring(0, splitPoint);
687 if(prefix.length() == 0) {
688 reportError(node, String.format("Invalid prefix length in prefix attribute '%s'", prefixAttribute));
689 continue;
690 }
691 final URI uri;
692 final String uriStr = prefixPart.substring(splitPoint + 1);
693 try {
694 uri = resolveURI(uriStr);
695 } catch (Exception e) {
696 reportError(
697 node,
698 String.format(
699 "Resolution of prefix '%s' defines an invalid URI: '%s'",
700 prefixAttribute, uriStr
701 )
702 );
703 continue;
704 }
705 prefixMapList.add( new PrefixMap(prefix, uri) );
706 }
707 }
708
709
710
711
712
713
714
715 private void updateLanguage(Node node, EvaluationContext currentEvaluationContext) {
716 final String candidateLanguage = DomUtils.readAttribute(node, XML_LANG_ATTRIBUTE, null);
717 if(candidateLanguage != null) currentEvaluationContext.language = candidateLanguage;
718 }
719
720
721
722
723
724
725
726
727
728 private void establishNewSubject(Node node, EvaluationContext currentEvaluationContext)
729 throws URISyntaxException {
730 String candidateURIOrCURIE;
731 for(String subjectAttribute : SUBJECT_ATTRIBUTES) {
732 candidateURIOrCURIE = DomUtils.readAttribute(node, subjectAttribute, null);
733 if(candidateURIOrCURIE != null) {
734 currentEvaluationContext.newSubject = resolveCURIEOrURI(candidateURIOrCURIE, false);
735 return;
736 }
737 }
738
739 if(node.getNodeName().equalsIgnoreCase(HEAD_TAG) || node.getNodeName().equalsIgnoreCase(BODY_TAG)) {
740 currentEvaluationContext.newSubject = resolveURI(currentEvaluationContext.base.toString());
741 return;
742 }
743
744 if(DomUtils.hasAttribute(node, TYPEOF_ATTRIBUTE)) {
745 currentEvaluationContext.newSubject = RDFUtils.bnode();
746 return;
747 }
748
749 if(DomUtils.hasAttribute(node, PROPERTY_ATTRIBUTE)) {
750 currentEvaluationContext.skipElem = true;
751 }
752 if(currentEvaluationContext.parentObject != null) {
753 currentEvaluationContext.newSubject = (Resource) currentEvaluationContext.parentObject;
754 return;
755 }
756
757 currentEvaluationContext.newSubject = null;
758 }
759
760
761
762
763
764
765
766
767
768
769 private void establishNewSubjectCurrentObjectResource(Node node, EvaluationContext currentEvaluationContext)
770 throws URISyntaxException {
771
772 String candidateURIOrCURIE;
773 candidateURIOrCURIE = DomUtils.readAttribute(node, ABOUT_ATTRIBUTE, null);
774 if(candidateURIOrCURIE != null) {
775 currentEvaluationContext.newSubject = resolveCURIEOrURI(candidateURIOrCURIE, false);
776 } else {
777 candidateURIOrCURIE = DomUtils.readAttribute(node, SRC_ATTRIBUTE, null);
778 if (candidateURIOrCURIE != null) {
779 currentEvaluationContext.newSubject = resolveURI(candidateURIOrCURIE);
780 } else {
781 if (node.getNodeName().equalsIgnoreCase(HEAD_TAG) || node.getNodeName().equalsIgnoreCase(BODY_TAG)) {
782 currentEvaluationContext.newSubject = resolveURI(currentEvaluationContext.base.toString());
783 } else {
784 if (DomUtils.hasAttribute(node, TYPEOF_ATTRIBUTE)) {
785 currentEvaluationContext.newSubject = RDFUtils.bnode();
786 } else {
787 if (currentEvaluationContext.parentObject != null) {
788 currentEvaluationContext.newSubject = (Resource) currentEvaluationContext.parentObject;
789 }
790 }
791 }
792 }
793 }
794
795
796 candidateURIOrCURIE = DomUtils.readAttribute(node, RESOURCE_ATTRIBUTE, null);
797 if(candidateURIOrCURIE != null) {
798 currentEvaluationContext.currentObjectResource = resolveCURIEOrURI(candidateURIOrCURIE, false);
799 return;
800 }
801
802 candidateURIOrCURIE = DomUtils.readAttribute(node, HREF_ATTRIBUTE, null);
803 if(candidateURIOrCURIE != null) {
804 currentEvaluationContext.currentObjectResource = resolveURI(candidateURIOrCURIE);
805 return;
806 }
807 currentEvaluationContext.currentObjectResource = null;
808 }
809
810 private URI[] getTypes(Node node) throws URISyntaxException {
811 final String typeOf = DomUtils.readAttribute(node, TYPEOF_ATTRIBUTE, null);
812 return resolveCurieOrURIList(node, typeOf, true);
813 }
814
815 private URI[] getRels(Node node) throws URISyntaxException {
816 final String rel = DomUtils.readAttribute(node, REL_ATTRIBUTE, null);
817 return resolveCurieOrURIList(node, rel, true);
818 }
819
820 private URI[] getRevs(Node node) throws URISyntaxException {
821 final String rev = DomUtils.readAttribute(node, REV_ATTRIBUTE, null);
822 return resolveCurieOrURIList(node, rev, true);
823 }
824
825 private URI[] getPredicate(Node node) throws URISyntaxException {
826 final String candidateURI = DomUtils.readAttribute(node, PROPERTY_ATTRIBUTE, null);
827 if(candidateURI == null) return null;
828 return resolveCurieOrURIList(node, candidateURI, true);
829 }
830
831
832
833
834
835
836
837
838
839
840
841 private Value getCurrentObject(Node node)
842 throws URISyntaxException, IOException, TransformerException {
843 final String candidateObject = DomUtils.readAttribute(node, HREF_ATTRIBUTE, null);
844 if(candidateObject != null) {
845 return resolveURI(candidateObject);
846 } else {
847 return gerCurrentObjectLiteral(node);
848 }
849 }
850
851 private Literal gerCurrentObjectLiteral(Node node)
852 throws URISyntaxException, IOException, TransformerException {
853 final EvaluationContext currentEvaluationContext = getContext();
854 Literal literal;
855
856 literal = getAsTypedLiteral(node);
857 if(literal != null) return literal;
858
859 literal = getAsXMLLiteral(node);
860 if(literal != null) {
861 currentEvaluationContext.recourse = false;
862 return literal;
863 }
864
865 literal = getAsPlainLiteral(node, currentEvaluationContext.language);
866 if(literal != null) return literal;
867
868 return null;
869 }
870
871 private static String getNodeContent(Node node) {
872 final String candidateContent = DomUtils.readAttribute(node, CONTENT_ATTRIBUTE, null);
873 if(candidateContent != null) return candidateContent;
874 return node.getTextContent();
875 }
876
877
878
879
880
881
882
883
884
885 private Literal getAsTypedLiteral(Node node) throws URISyntaxException {
886 final String datatype = DomUtils.readAttribute(node, DATATYPE_ATTRIBUTE, null);
887 if (datatype == null || datatype.trim().length() == 0 || XML_LITERAL_DATATYPE.equals(datatype.trim()) ) {
888 return null;
889 }
890 final Resource curieOrURI = resolveCURIEOrURI(datatype, true);
891 return RDFUtils.literal(getNodeContent(node), curieOrURI instanceof URI ? (URI) curieOrURI : null);
892 }
893
894 private void pushMappings(Node sourceNode, List<PrefixMap> prefixMapList) {
895
896
897 final Map<String, URI> mapping = new HashMap<String, URI>();
898 for (PrefixMap prefixMap : prefixMapList) {
899 mapping.put(prefixMap.prefix, prefixMap.uri);
900 }
901 uriMappingStack.push( new URIMapping(sourceNode, mapping) );
902 }
903
904 private void popMappings(Node node) {
905 if(uriMappingStack.isEmpty()) return;
906 final URIMapping peek = uriMappingStack.peek();
907 if( ! DomUtils.isAncestorOf(peek.sourceNode, node) ) {
908
909 uriMappingStack.pop();
910 }
911 }
912
913
914
915
916
917
918
919
920
921 private Resource resolveNamespacedURI(String mapping, ResolutionPolicy resolutionPolicy) {
922 if(mapping.indexOf(URI_PATH_SEPARATOR) == 0) {
923 mapping = mapping.substring(1);
924 }
925
926 final int prefixSeparatorIndex = mapping.indexOf(':');
927 if(prefixSeparatorIndex == -1) {
928 if(resolutionPolicy == ResolutionPolicy.NSRequired) {
929 throw new IllegalArgumentException(
930 String.format("Invalid mapping string [%s], must declare a prefix.", mapping)
931 );
932 }
933 if (resolutionPolicy == ResolutionPolicy.TermAllowed) {
934 final URI currentVocabulary = getVocabulary();
935
936 if (currentVocabulary != null) {
937 return resolveURI(currentVocabulary.toString() + mapping);
938 }
939 }
940 return resolveURI(documentBase.toString() + mapping);
941 }
942
943 final String prefix = mapping.substring(0, prefixSeparatorIndex);
944 final URI curieMapping = getMapping(prefix);
945 if(curieMapping == null) {
946 throw new IllegalArgumentException( String.format("Cannot map prefix '%s'", prefix) );
947 }
948 final String candidateCURIEStr = curieMapping.toString() + mapping.substring(prefixSeparatorIndex + 1);
949 final java.net.URI candidateCURIE;
950 try {
951 candidateCURIE = new java.net.URI(candidateCURIEStr);
952 } catch (URISyntaxException urise) {
953 throw new IllegalArgumentException(String.format("Invalid CURIE '%s'", candidateCURIEStr) );
954 }
955 return resolveURI(
956 candidateCURIE.isAbsolute()
957 ?
958 candidateCURIE.toString()
959 :
960 documentBase.toString() + candidateCURIE.toString()
961 );
962 }
963
964
965
966
/**
 * Ways of handling a name without prefix during namespaced URI resolution.
 */
enum ResolutionPolicy {

    /** A name without prefix is resolved against the document base. */
    NSNotRequired,

    /** A name without prefix is rejected. */
    NSRequired,

    /** A name without prefix is resolved against the current vocabulary, when one is declared. */
    TermAllowed

}
972
973
974
975
976 private class EvaluationContext {
977 private Node node;
978 private URL base;
979 private Resource parentSubject;
980 private Value parentObject;
981 private String language;
982 private boolean recourse;
983 private boolean skipElem;
984 private Resource newSubject;
985 private Resource currentObjectResource;
986
987
988
989
990
991
992 EvaluationContext(URL base) {
993 this.base = base;
994 this.parentSubject = resolveURI( base.toExternalForm() );
995 this.parentObject = null;
996 this.language = null;
997 this.recourse = true;
998 this.skipElem = false;
999 this.newSubject = null;
1000 this.currentObjectResource = null;
1001 }
1002 }
1003
1004
1005
1006
1007 private class PrefixMap {
1008 final String prefix;
1009 final URI uri;
1010 public PrefixMap(String prefix, URI uri) {
1011 this.prefix = prefix;
1012 this.uri = uri;
1013 }
1014 }
1015
1016
1017
1018
1019 private class URIMapping {
1020 final Node sourceNode;
1021 final Map<String, URI> map;
1022
1023 public URIMapping(Node sourceNode, Map<String, URI> map) {
1024 this.sourceNode = sourceNode;
1025 this.map = map;
1026 }
1027 }
1028
1029
1030
1031
1032 private enum IncompleteTripleDirection {
1033 Forward,
1034 Reverse
1035 }
1036
1037
1038
1039
1040 private class IncompleteTriple {
1041 final Node originatingNode;
1042 final Resource subject;
1043 final URI predicate;
1044 final IncompleteTripleDirection direction;
1045
1046 public IncompleteTriple(
1047 Node originatingNode,
1048 Resource subject,
1049 URI predicate,
1050 IncompleteTripleDirection direction
1051 ) {
1052 if(originatingNode == null || subject == null || predicate == null || direction == null)
1053 throw new IllegalArgumentException();
1054
1055 this.originatingNode = originatingNode;
1056 this.subject = subject;
1057 this.predicate = predicate;
1058 this.direction = direction;
1059 }
1060
1061 public boolean produceTriple(Node resourceNode, Resource r, ExtractionResult extractionResult) {
1062 if( ! DomUtils.isAncestorOf(originatingNode, resourceNode, true) ) return false;
1063
1064 if(r == null) throw new IllegalArgumentException();
1065 switch (direction) {
1066 case Forward:
1067 extractionResult.writeTriple(subject, predicate, r);
1068 break;
1069 case Reverse:
1070 extractionResult.writeTriple(r, predicate, subject);
1071 break;
1072 default:
1073 throw new IllegalStateException();
1074 }
1075 return true;
1076 }
1077 }
1078
1079
1080
1081
1082 private class Vocabulary {
1083 final Node originatingNode;
1084 final URI prefix;
1085
1086 public Vocabulary(Node originatingNode, URI prefix) {
1087 this.originatingNode = originatingNode;
1088 this.prefix = prefix;
1089 }
1090 }
1091
1092 }