View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.any23.extractor.microdata;
18  
19  import org.apache.any23.extractor.html.DomUtils;
20  import org.apache.any23.rdf.RDFUtils;
21  import org.apache.commons.lang.StringUtils;
22  import org.eclipse.rdf4j.model.IRI;
23  import org.eclipse.rdf4j.model.Literal;
24  import org.eclipse.rdf4j.model.datatypes.XMLDatatypeUtil;
25  import org.eclipse.rdf4j.model.vocabulary.XMLSchema;
26  import org.jsoup.parser.Tag;
27  import org.w3c.dom.Document;
28  import org.w3c.dom.Element;
29  import org.w3c.dom.NamedNodeMap;
30  import org.w3c.dom.Node;
31  import org.w3c.dom.NodeList;
32  import org.w3c.dom.traversal.DocumentTraversal;
33  import org.w3c.dom.traversal.NodeFilter;
34  import org.w3c.dom.traversal.TreeWalker;
35  
36  import java.io.PrintStream;
37  import java.util.ArrayList;
38  import java.util.Arrays;
39  import java.util.Collections;
40  import java.util.HashMap;
41  import java.util.HashSet;
42  import java.util.LinkedHashSet;
43  import java.util.List;
44  import java.util.Locale;
45  import java.util.Map;
46  import java.util.Set;
47  import java.util.stream.Collectors;
48  
49  /**
50   * This class provides utility methods for handling <b>Microdata</b>
51   * nodes contained within a <i>DOM</i> document.
52   *
53   * @author Michele Mostarda (mostarda@fbk.eu)
54   * @author Hans Brende (hansbrende@apache.org)
55   */
56  public class MicrodataParser {
57  
58      enum ErrorMode {
59          /** This mode raises an exception at first encountered error. */
60          STOP_AT_FIRST_ERROR,
61          /**  This mode produces a full error report. */
62          FULL_REPORT
63      }
64  
65      private final Document document;
66  
67      /**
68       * This set holds the name of properties being dereferenced.
69       * The {@link #deferProperties(String...)} checks first if the
70       * required dereference has been already asked, if so raises
71       * a loop detection error. This map works in coordination
72       * with {@link #dereferenceRecursionCounter}, so that at the end of
73       * {@link #deferProperties(String...)} call recursion the
74       * loopDetectorSet can be cleaned up.
75       */
76      private final Set<String> loopDetectorSet = new HashSet<>();
77  
78      /**
79       * {@link ItemScope} cache.
80       */
81      private final Map<Node,ItemScope> itemScopes = new HashMap<>();
82  
83      /**
84       * {@link ItemPropValue} cache.
85       */
86      private final Map<Node, ItemPropValue> itemPropValues = new HashMap<>();
87  
88     /**
89       * Counts the recursive call of {@link #deferProperties(String...)}.
90       * It helps to cleanup the {@link #loopDetectorSet} when recursion ends.
91       */
92      private int dereferenceRecursionCounter = 0;
93  
94      /**
95       * Current error mode.
96       */
97      private ErrorMode errorMode = ErrorMode.FULL_REPORT;
98  
99      /**
100      * List of collected errors. Used when {@link #errorMode} <code>==</code> {@link ErrorMode#FULL_REPORT}.
101      */
102     private final List<MicrodataParserException> errors = new ArrayList<>();
103 
104     public static final String ITEMSCOPE_ATTRIBUTE = "itemscope";
105     public static final String ITEMPROP_ATTRIBUTE  = "itemprop";
106     private static final String REVERSE_ITEMPROP_ATTRIBUTE = "itemprop-reverse";
107 
108     /**
109      * List of tags providing the <code>src</code> property.
110      */
111     public static final Set<String> SRC_TAGS =  Collections.unmodifiableSet(
112             new HashSet<String>( Arrays.asList("audio", "embed", "frame", "iframe", "img", 
113               "source", "track", "video", "input", "layer", "script", "textarea") )
114     );
115 
116     /**
117      * List of tags providing the <code>href</code> property.
118      */
119     public static final Set<String> HREF_TAGS =  Collections.unmodifiableSet(
120             new HashSet<String>( Arrays.asList("a", "area", "link") )
121     );
122 
123     public MicrodataParser(Document document) {
124       if(document == null) {
125           throw new NullPointerException("Document cannot be null.");
126       }
127       this.document = document;
128     }
129 
130     /**
131      * Returns all the <i>itemScope</i>s detected within the given root node.
132      *
133      * @param node root node to search in.
134      * @return list of detected items.
135      */
136     public static List<Node> getItemScopeNodes(Node node) {
137         return DomUtils.findAllByAttributeName(node, ITEMSCOPE_ATTRIBUTE);
138     }
139 
140     /**
141      * Check whether a node is an <i>itemScope</i>.
142      *
143      * @param node node to check.
144      * @return <code>true</code> if the node is an <i>itemScope</i>., <code>false</code> otherwise.
145      */
146     public static boolean isItemScope(Node node) {
147         return DomUtils.readAttribute(node, ITEMSCOPE_ATTRIBUTE, null) != null;
148     }
149 
150     /**
151      * Returns all the <i>itemProp</i>s detected within the given root node.
152      *
153      * @param node root node to search in.
154      * @return list of detected items.
155      */
156     public static List<Node> getItemPropNodes(Node node) {
157         return DomUtils.findAllByAttributeName(node, ITEMPROP_ATTRIBUTE);
158     }
159 
160     /**
161      * Check whether a node is an <i>itemProp</i>.
162      *
163      * @param node node to check.
164      * @return <code>true</code> if the node is an <i>itemProp</i>., <code>false</code> otherwise.
165      */
166     public static boolean isItemProp(Node node) {
167         return DomUtils.readAttribute(node, ITEMPROP_ATTRIBUTE, null) != null;
168     }
169 
170     private static boolean isContainedInItemScope(Node node) {
171         for (Node p = node.getParentNode(); p != null; p = p.getParentNode()) {
172             NamedNodeMap attrs = p.getAttributes();
173             if (attrs != null && attrs.getNamedItem(ITEMSCOPE_ATTRIBUTE) != null) {
174                 return true;
175             }
176         }
177         return false;
178     }
179 
180     private static boolean isContainedInId(Node node, Set<String> ids) {
181         do {
182             String id = DomUtils.readAttribute(node, "id", null);
183             if (id != null && ids.contains(id)) {
184                 return true;
185             }
186             node = node.getParentNode();
187         } while (node != null);
188         return false;
189     }
190 
191     /**
192      * Returns only the <i>itemScope</i>s that are top level items.
193      *
194      * @param node root node to search in.
195      * @return list of detected top item scopes.
196      */
197     public static List<Node> getTopLevelItemScopeNodes(Node node)  {
198         final List<Node> itemScopes = getItemScopeNodes(node);
199         final List<Node> topLevelItemScopes = new ArrayList<>();
200         final List<Node> possibles = new ArrayList<>();
201         for (Node itemScope : itemScopes) {
202             if (!isItemProp(itemScope)
203                     && DomUtils.readAttribute(itemScope, REVERSE_ITEMPROP_ATTRIBUTE, null) == null) {
204                 topLevelItemScopes.add(itemScope);
205             } else if (!isContainedInItemScope(itemScope)) {
206                 possibles.add(itemScope);
207             }
208         }
209 
210         if (!possibles.isEmpty()) {
211             Set<String> refIds = itemScopes.stream()
212                     .flatMap(n -> Arrays.stream(itemrefIds(n)))
213                     .collect(Collectors.toSet());
214 
215             for (Node itemScope : possibles) {
216                 if (!isContainedInId(itemScope, refIds)) {
217                     topLevelItemScopes.add(itemScope);
218                 }
219             }
220         }
221 
222         return topLevelItemScopes;
223     }
224 
225     /**
226      * Returns all the <b>Microdata items</b> detected within the given <code>document</code>.
227      *
228      * @param document document to be processed.
229      * @param errorMode error management policy.
230      * @return list of <b>itemscope</b> items.
231      * @throws MicrodataParserException if
232      *         <code>errorMode == {@link org.apache.any23.extractor.microdata.MicrodataParser.ErrorMode#STOP_AT_FIRST_ERROR}</code>
233      *         and an error occurs.
234      */
235     public static MicrodataParserReport getMicrodata(Document document, ErrorMode errorMode)
236     throws MicrodataParserException {
237         final List<Node> itemNodes = getTopLevelItemScopeNodes(document);
238         final List<ItemScope> items = new ArrayList<>();
239         final MicrodataParserataParser.html#MicrodataParser">MicrodataParser microdataParser = new MicrodataParser(document);
240         microdataParser.setErrorMode(errorMode);
241         for(Node itemNode : itemNodes) {
242             items.add( microdataParser.getItemScope(itemNode) );
243         }
244         return new MicrodataParserReport(
245                 items.toArray( new ItemScope[items.size()] ),
246                 microdataParser.getErrors()
247         );
248     }
249 
250     /**
251      * Returns all the <b>Microdata items</b> detected within the given <code>document</code>,
252      * works in full report mode.
253      *
254      * @param document document to be processed.
255      * @return list of <b>itemscope</b> items.
256      */
257     public static MicrodataParserReport getMicrodata(Document document) {
258         try {
259             return getMicrodata(document, ErrorMode.FULL_REPORT);
260         } catch (MicrodataParserException mpe) {
261              throw new IllegalStateException("Unexpected exception.", mpe);
262         }
263     }
264 
265     /**
266      * Returns a <i>JSON</i> containing the list of all extracted Microdata,
267      * as described at <a href="http://www.w3.org/TR/microdata/#json">Microdata JSON Specification</a>.
268      *
269      * @param document document to be processed.
270      * @param ps the {@link java.io.PrintStream} to write JSON to
271      */
272     public static void getMicrodataAsJSON(Document document, PrintStream ps) {
273         final MicrodataParserReport report = getMicrodata(document);
274         final ItemScope[] itemScopes = report.getDetectedItemScopes();
275         final MicrodataParserException[] errors = report.getErrors();
276 
277         ps.append("{ ");
278 
279         // Results.
280         ps.append("\"result\" : [");
281         for(int i = 0; i < itemScopes.length; i++) {
282             if (i > 0) {
283                 ps.print(", ");
284             }
285             ps.print( itemScopes[i].toJSON() );
286         }
287         ps.append("] ");
288 
289         // Errors.
290         if(errors != null && errors.length > 0) {
291             ps.append(", ");
292             ps.append("\"errors\" : [");
293             for (int i = 0; i < errors.length; i++) {
294                 if (i > 0) {
295                     ps.print(", ");
296                 }
297                 ps.print( errors[i].toJSON() );
298             }
299             ps.append("] ");
300         }
301 
302         ps.append("}");
303     }
304 
305     public void setErrorMode(ErrorMode errorMode) {
306         if(errorMode == null)
307             throw new IllegalArgumentException("errorMode must be not null.");
308         this.errorMode = errorMode;
309     }
310 
311     public ErrorMode getErrorMode() {
312         return this.errorMode;
313     }
314 
315     public MicrodataParserException[] getErrors() {
316         return errors == null
317                 ?
318                 new MicrodataParserException[0]
319                 :
320                 errors.toArray( new MicrodataParserException[errors.size()] );
321     }
322 
323     /**
324      * Reads the value of a <b>itemprop</b> node.
325      *
326      * @param node itemprop node.
327      * @return value detected within the given <code>node</code>.
328      * @throws MicrodataParserException if an error occurs while extracting a nested item scope.
329      */
330     public ItemPropValue getPropertyValue(Node node) throws MicrodataParserException {
331         final ItemPropValue itemPropValue = itemPropValues.get(node);
332         if (itemPropValue != null)
333             return itemPropValue;
334 
335         if (isItemScope(node)) {
336             return new ItemPropValue( getItemScope(node), ItemPropValue.Type.Nested);
337         }
338 
339         final String nodeName = node.getNodeName().toLowerCase();
340 
341         //see http://w3c.github.io/microdata-rdf/#dfn-property-values
342         if ("data".equals(nodeName) || "meter".equals(nodeName)) {
343             String value = value(node, "value");
344             Literal l;
345             if (XMLDatatypeUtil.isValidInteger(value)) {
346                 l = RDFUtils.literal(value, XMLSchema.INTEGER);
347             } else if (XMLDatatypeUtil.isValidDouble(value)) {
348                 l = RDFUtils.literal(value, XMLSchema.DOUBLE);
349             } else {
350                 l = RDFUtils.literal(value);
351             }
352             return new ItemPropValue(l);
353         }
354         if ("time".equals(nodeName)) {
355             String dateTimeStr = value(node, "datetime");
356             Literal l;
357             if (XMLDatatypeUtil.isValidDate(dateTimeStr)) {
358                 l = RDFUtils.literal(dateTimeStr, XMLSchema.DATE);
359             } else if (XMLDatatypeUtil.isValidTime(dateTimeStr)) {
360                 l = RDFUtils.literal(dateTimeStr, XMLSchema.TIME);
361             } else if (XMLDatatypeUtil.isValidDateTime(dateTimeStr)) {
362                 l = RDFUtils.literal(dateTimeStr, XMLSchema.DATETIME);
363             } else if (XMLDatatypeUtil.isValidGYearMonth(dateTimeStr)) {
364                 l = RDFUtils.literal(dateTimeStr, XMLSchema.GYEARMONTH);
365             } else if (XMLDatatypeUtil.isValidGYear(dateTimeStr)) {
366                 l = RDFUtils.literal(dateTimeStr, XMLSchema.GYEAR);
367             } else if (XMLDatatypeUtil.isValidDuration(dateTimeStr)) {
368                 l = RDFUtils.literal(dateTimeStr, XMLSchema.DURATION);
369             } else {
370                 l = RDFUtils.literal(dateTimeStr, getLanguage(node));
371             }
372             return new ItemPropValue(l);
373         }
374 
375         if (SRC_TAGS.contains(nodeName)) {
376             return link(node, "src");
377         }
378         if (HREF_TAGS.contains(nodeName)) {
379             return link(node, "href");
380         }
381 
382         if ("object".equals(nodeName)) {
383             return link(node, "data");
384         }
385 
386         String val = DomUtils.readAttribute(node, "content", null);
387         if (val != null) {
388             return new ItemPropValue(RDFUtils.literal(val, getLanguage(node)));
389         }
390 
391         Literal l = RDFUtils.literal(textContent(node), getLanguage(node));
392         final ItemPropValueropValue.html#ItemPropValue">ItemPropValue newItemPropValue = new ItemPropValue(l);
393         itemPropValues.put(node, newItemPropValue);
394         return newItemPropValue;
395     }
396 
397     private static String textContent(Node node) {
398         StringBuilder content = new StringBuilder();
399         appendFormatted(node, content, false);
400         return content.toString();
401     }
402 
403     private static boolean shouldSeparateWithNewline(CharSequence s0, CharSequence s1) {
404         for (int i = 0, len = s1.length(); i < len; i++) {
405             char ch = s1.charAt(i);
406             if (ch == '\n' || ch == '\r') {
407                 return false;
408             }
409             if (!Character.isWhitespace(ch)) {
410                 break;
411             }
412         }
413         for (int i = s0.length() - 1; i >= 0; i--) {
414             char ch = s0.charAt(i);
415             if (ch == '\n' || ch == '\r') {
416                 return false;
417             }
418             if (!Character.isWhitespace(ch)) {
419                 return true;
420             }
421         }
422         return false;
423     }
424 
425     private static boolean appendFormatted(Node node, StringBuilder sb, boolean needsNewline) {
426         switch (node.getNodeType()) {
427             case Node.TEXT_NODE:
428                 String text = node.getTextContent();
429                 if (text.isEmpty()) {
430                     return needsNewline;
431                 }
432                 if (needsNewline && shouldSeparateWithNewline(sb, text)) {
433                     sb.append('\n');
434                 }
435                 sb.append(text);
436                 return false;
437             case Node.ELEMENT_NODE:
438                 final String nodeName = node.getNodeName().toLowerCase(Locale.ENGLISH);
439                 final boolean thisNeedsNewline = "br".equals(nodeName) || Tag.valueOf(nodeName).isBlock();
440                 final NodeList children = node.getChildNodes();
441                 boolean prevChildNeedsNewline = needsNewline || thisNeedsNewline;
442                 for (int i = 0, len = children.getLength(); i < len; i++) {
443                     prevChildNeedsNewline = appendFormatted(children.item(i), sb, prevChildNeedsNewline);
444                 }
445                 return prevChildNeedsNewline || thisNeedsNewline;
446             default:
447                 return needsNewline;
448         }
449     }
450 
451     private static String content(Node node, String attrName) {
452         NamedNodeMap attributes = node.getAttributes();
453         if (attributes != null) {
454             Node attr = attributes.getNamedItem("content");
455             if (attr != null) {
456                 return attr.getNodeValue();
457             }
458             attr = attributes.getNamedItem(attrName);
459             if (attr != null) {
460                 return attr.getNodeValue();
461             }
462         }
463         return null;
464     }
465 
466     private static String value(Node node, String attrName) {
467         String content = content(node, attrName);
468         return StringUtils.stripToEmpty(content != null ? content : node.getTextContent());
469     }
470 
471     private static ItemPropValue link(Node node, String attrName) {
472         String content = content(node, attrName);
473         return content == null ? new ItemPropValue(RDFUtils.literal(""))
474                 : new ItemPropValue(content, ItemPropValue.Type.Link);
475     }
476 
477     //see https://www.w3.org/TR/html52/dom.html#the-lang-and-xmllang-attributes
478     private static String getLanguage(Node node) {
479         String lang;
480         do {
481             lang = DomUtils.readAttribute(node, "xml:lang", null);
482             if (StringUtils.isNotBlank(lang)) {
483                 return lang.trim();
484             }
485             lang = DomUtils.readAttribute(node, "lang", null);
486             if (StringUtils.isNotBlank(lang)) {
487                 return lang.trim();
488             }
489             node = node.getParentNode();
490         } while (node != null);
491         return null;
492     }
493 
494     /**
495      * Returns all the <b>itemprop</b>s for the given <b>itemscope</b> node.
496      *
497      * @param scopeNode node representing the <b>itemscope</b>
498      * @param skipRoot if <code>true</code> the given root <code>node</code>
499      *        will be not read as a property, even if it contains the <b>itemprop</b> attribute.
500      * @return the list of <b>itemprop</b>s detected within the given <b>itemscope</b>.
501      * @throws MicrodataParserException if an error occurs while retrieving an property value.
502      */
503     public List<ItemProp> getItemProps(final Node scopeNode, boolean skipRoot) throws MicrodataParserException {
504         final Set<Node> accepted = new LinkedHashSet<>();
505 
506         boolean skipRootChildren = false;
507         if (!skipRoot) {
508             NamedNodeMap attributes = scopeNode.getAttributes();
509             if (attributes.getNamedItem(ITEMPROP_ATTRIBUTE) != null
510                     || attributes.getNamedItem(REVERSE_ITEMPROP_ATTRIBUTE) != null) {
511                 accepted.add(scopeNode);
512             }
513             if (attributes.getNamedItem(ITEMSCOPE_ATTRIBUTE) != null) {
514                 skipRootChildren = true;
515             }
516         }
517 
518         if (!skipRootChildren) {
519             // TreeWalker to walk DOM tree starting with the scopeNode. Nodes maybe visited multiple times.
520             TreeWalker treeWalker = ((DocumentTraversal) scopeNode.getOwnerDocument())
521                     .createTreeWalker(scopeNode, NodeFilter.SHOW_ELEMENT, new NodeFilter() {
522                         @Override
523                         public short acceptNode(Node node) {
524                             if (node.getNodeType() == Node.ELEMENT_NODE) {
525                                 NamedNodeMap attributes = node.getAttributes();
526                                 if ((attributes.getNamedItem(ITEMPROP_ATTRIBUTE) != null
527                                         || attributes.getNamedItem(REVERSE_ITEMPROP_ATTRIBUTE) != null) && scopeNode != node) {
528                                     accepted.add(node);
529                                 }
530 
531                                 if (attributes.getNamedItem(ITEMSCOPE_ATTRIBUTE) != null) {
532                                     // Don't visit descendants of nodes that define a new scope
533                                     return FILTER_REJECT;
534                                 }
535                             }
536                             return FILTER_ACCEPT;
537                         }
538                     }, false);
539 
540 
541             // To populate accepted we only need to walk the tree.
542             while (treeWalker.nextNode() != null) ;
543         }
544 
545         final List<ItemProp> result = new ArrayList<>();
546         for (Node itemPropNode : accepted) {
547             final String itemProp = DomUtils.readAttribute(itemPropNode, ITEMPROP_ATTRIBUTE, null);
548             final String reverseProp = DomUtils.readAttribute(itemPropNode, REVERSE_ITEMPROP_ATTRIBUTE, null);
549 
550             boolean hasItemProp = StringUtils.isNotBlank(itemProp);
551             boolean hasReverseProp = StringUtils.isNotBlank(reverseProp);
552 
553             if (!hasItemProp && !hasReverseProp) {
554                 manageError(new MicrodataParserException("invalid property name '" + itemProp + "'", itemPropNode));
555                 continue;
556             }
557 
558             ItemPropValue itemPropValue;
559             try {
560                 itemPropValue = getPropertyValue(itemPropNode);
561             } catch (MicrodataParserException mpe) {
562                 manageError(mpe);
563                 continue;
564             }
565             if (hasItemProp) {
566                 for (String propertyName : itemProp.trim().split("\\s+")) {
567                     result.add(
568                             new ItemProp(
569                                     DomUtils.getXPathForNode(itemPropNode),
570                                     propertyName,
571                                     itemPropValue,
572                                     false
573                             )
574                     );
575                 }
576             }
577             if (hasReverseProp) {
578                 if (itemPropValue.literal != null) {
579                     manageError(new MicrodataParserException(REVERSE_ITEMPROP_ATTRIBUTE
580                             + " cannot point to a literal", itemPropNode));
581                     continue;
582                 }
583                 for (String propertyName : reverseProp.trim().split("\\s+")) {
584                     result.add(
585                             new ItemProp(
586                                     DomUtils.getXPathForNode(itemPropNode),
587                                     propertyName,
588                                     itemPropValue,
589                                     true
590                             )
591                     );
592                 }
593             }
594         }
595         return result;
596     }
597 
598     /**
599      * Given a document and a list of <b>itemprop</b> names this method will return
600      * such <b>itemprops</b>.
601      * 
602      * @param refs list of references.
603      * @return list of retrieved <b>itemprop</b>s.
604      * @throws MicrodataParserException if a loop is detected or a property name is missing.
605      */
606     public ItemProp[] deferProperties(String... refs) throws MicrodataParserException {
607         Document document = this.document;
608         dereferenceRecursionCounter++;
609         final List<ItemProp> result = new ArrayList<>();
610         try {
611             for (String ref : refs) {
612                 if (loopDetectorSet.contains(ref)) {
613                         throw new MicrodataParserException(
614                                 String.format(
615                                         "Loop detected with depth %d while dereferencing itemProp '%s' .",
616                                         dereferenceRecursionCounter - 1, ref
617                                 ),
618                                 null
619                         );
620                 }
621                 loopDetectorSet.add(ref);
622                 Element element = document.getElementById(ref);
623                 if (element == null) {
624                     manageError(
625                             new MicrodataParserException( String.format("Unknown itemProp id '%s'", ref ), null )
626                     );
627                     continue;
628                 }
629                 result.addAll(getItemProps(element, false));
630             }
631         } catch (MicrodataParserException mpe) {
632             if(dereferenceRecursionCounter == 1)
633                 manageError(mpe);
634             else throw mpe;  // Recursion end, this the the top call.
635         } finally {
636             dereferenceRecursionCounter--;
637             if(dereferenceRecursionCounter == 0) { // Recursion end, this the the top call.
638                 loopDetectorSet.clear();
639             }
640         }
641         return result.toArray( new ItemProp[result.size()] );
642     }
643 
644     private static final String[] EMPTY_STRINGS = new String[0];
645     private static String[] itemrefIds(Node node) {
646         String itemref = DomUtils.readAttribute(node, "itemref" , null);
647         return StringUtils.isBlank(itemref) ? EMPTY_STRINGS : itemref.trim().split("\\s+");
648     }
649 
650     /**
651      * Returns the {@link ItemScope} instance described within the specified <code>node</code>.
652      *
653      * @param node node describing an <i>itemscope</i>.
654      * @return instance of ItemScope object.
655      * @throws MicrodataParserException if an error occurs while dereferencing properties.
656      */
657     public ItemScope getItemScope(Node node) throws MicrodataParserException {
658         final ItemScope itemScope = itemScopes.get(node);
659         if(itemScope != null)
660             return itemScope;
661 
662         final String id       = DomUtils.readAttribute(node, "id"      , null);
663         final String itemType = DomUtils.readAttribute(node, "itemtype", null);
664         final String itemId   = DomUtils.readAttribute(node, "itemid"  , null);
665 
666         final List<ItemProp> itemProps = getItemProps(node, true);
667         final String[] itemrefIDs = itemrefIds(node);
668         final ItemProp[] deferredProperties;
669         try {
670             deferredProperties = deferProperties(itemrefIDs);
671         } catch (MicrodataParserException mpe) {
672             mpe.setErrorNode(node);
673             throw mpe;
674         }
675         for(ItemProp deferredProperty : deferredProperties) {
676             if( itemProps.contains(deferredProperty) ) {
677                 manageError(
678                         new MicrodataParserException(
679                             String.format("Duplicated deferred itemProp '%s'.", deferredProperty.getName() ),
680                             node
681                         )
682                 );
683                 continue;
684             }
685             itemProps.add(deferredProperty);
686         }
687 
688         List<IRI> types;
689         if (itemType == null) {
690             types = Collections.emptyList();
691         } else {
692             types = new ArrayList<>();
693             boolean canConcatWithPrev = false;
694             for (String s : itemType.trim().split("\\s+")) {
695                 try {
696                     canConcatWithPrev = types.addAll(ItemScope.stringToSingletonIRI(s));
697                 } catch (RuntimeException e) {
698                     if (canConcatWithPrev) {
699                         int lastInd = types.size() - 1;
700                         try {
701                             List<IRI> secondTry = ItemScope.stringToSingletonIRI(types.get(lastInd).stringValue() + " " + s);
702                             types.remove(lastInd);
703                             canConcatWithPrev = types.addAll(secondTry);
704                         } catch (RuntimeException e2) {
705                             manageError(new MicrodataParserException(e.getMessage(), node));
706                             canConcatWithPrev = false;
707                         }
708                     } else {
709                         manageError(new MicrodataParserException(e.getMessage(), node));
710                     }
711                 }
712             }
713         }
714 
715         final ItemScopeta/ItemScope.html#ItemScope">ItemScope newItemScope = new ItemScope(
716                 DomUtils.getXPathForNode(node),
717                 itemProps.toArray(new ItemProp[itemProps.size()]),
718                 id,
719                 itemrefIDs,
720                 types,
721                 itemId
722         );
723         itemScopes.put(node, newItemScope);
724         return newItemScope;
725     }
726 
727     private void manageError(MicrodataParserExceptiondataParserException.html#MicrodataParserException">MicrodataParserException mpe) throws MicrodataParserException {
728         switch (errorMode) {
729             case FULL_REPORT:
730                 errors.add(mpe);
731                 break;
732             case STOP_AT_FIRST_ERROR:
733                 throw mpe;
734             default:
735                 throw new IllegalStateException("Unsupported mode " + errorMode);
736         }
737     }
738 
739 }