View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.microdata;
19  
20  import org.apache.any23.extractor.ExtractionContext;
21  import org.apache.any23.extractor.ExtractionException;
22  import org.apache.any23.extractor.ExtractionParameters;
23  import org.apache.any23.extractor.ExtractionResult;
24  import org.apache.any23.extractor.Extractor;
25  import org.apache.any23.extractor.ExtractorDescription;
26  import org.apache.any23.extractor.IssueReport;
27  import org.apache.any23.rdf.RDFUtils;
28  import org.eclipse.rdf4j.common.net.ParsedIRI;
29  import org.eclipse.rdf4j.model.IRI;
30  import org.eclipse.rdf4j.model.Resource;
31  import org.eclipse.rdf4j.model.Value;
32  import org.eclipse.rdf4j.model.vocabulary.RDF;
33  import org.w3c.dom.Document;
34  
35  import java.io.IOException;
36  import java.net.URISyntaxException;
37  import java.util.HashMap;
38  import java.util.List;
39  import java.util.Map;
40  import java.util.Optional;
41  
42  /**
43   * Default implementation of <a href="https://www.w3.org/TR/microdata/">Microdata</a> extractor, based on
44   * {@link org.apache.any23.extractor.Extractor.TagSoupDOMExtractor}.
45   *
46   * @author Michele Mostarda (mostarda@fbk.eu)
47   * @author Davide Palmisano ( dpalmisano@gmail.com )
48   * @author Hans Brende (hansbrende@apache.org)
49   */
50  public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
51  
52      static final IRI MICRODATA_ITEM = RDFUtils.iri("http://www.w3.org/1999/xhtml/microdata#item");
53  
54      private static final ParsedIRI EMPTY_FRAG = ParsedIRI.create("#");
55  
56      @Override
57      public ExtractorDescription getDescription() {
58          return MicrodataExtractorFactory.getDescriptionInstance();
59      }
60  
61      /**
62       * This extraction performs the <a href="https://www.w3.org/TR/microdata-rdf/">Microdata to RDF conversion
63       * algorithm</a>.
64       */
65      @Override
66      public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, Document in,
67              ExtractionResult out) throws IOException, ExtractionException {
68  
69          final MicrodataParserReport parserReport = MicrodataParser.getMicrodata(in);
70          if (parserReport.getErrors().length > 0) {
71              notifyError(parserReport.getErrors(), out);
72          }
73          final ItemScope[] itemScopes = parserReport.getDetectedItemScopes();
74          if (itemScopes.length == 0) {
75              return;
76          }
77  
78          final IRI documentIRI = extractionContext.getDocumentIRI();
79          final ParsedIRI parsedDocumentIRI = ParsedIRI.create(documentIRI.stringValue());
80  
81          boolean isStrict = extractionParameters.getFlag("any23.microdata.strict");
82          final IRI defaultNamespace;
83          if (!isStrict) {
84              defaultNamespace = RDFUtils.iri(extractionParameters.getProperty("any23.microdata.ns.default"));
85              if (!defaultNamespace.getLocalName().isEmpty()) {
86                  throw new IllegalArgumentException("invalid namespace IRI: " + defaultNamespace);
87              }
88          } else {
89              // TODO: incorporate document's "base" element
90              defaultNamespace = RDFUtils.iri(parsedDocumentIRI.resolve(EMPTY_FRAG).toString());
91          }
92  
93          // https://www.w3.org/TR/microdata-rdf/#generate-the-triples
94          final Map<ItemScope, Resource> mappings = new HashMap<>();
95          for (ItemScope itemScope : itemScopes) {
96              Resource subject = processType(itemScope, parsedDocumentIRI, out, mappings, defaultNamespace);
97  
98              // Writing out md:item triple has been removed from spec
99              // but for now, keep for backwards compatibility.
100             out.writeTriple(documentIRI, MICRODATA_ITEM, subject);
101         }
102     }
103 
104     /**
105      * Recursive method implementing 6.3 "generate the triples" of the
106      * <a href="https://www.w3.org/TR/microdata-rdf/#generate-the-triples">Microdata to RDF</a> extraction algorithm.
107      */
108     private Resource processType(ItemScope itemScope, ParsedIRI documentIRI, ExtractionResult out,
109             Map<ItemScope, Resource> mappings, IRI defaultNamespace) throws ExtractionException {
110         Resource subject = mappings.computeIfAbsent(itemScope,
111                 scope -> createSubjectForItemId(documentIRI, scope.getItemId()));
112 
113         List<IRI> itemScopeTypes = itemScope.getTypes();
114         if (!itemScopeTypes.isEmpty()) {
115             defaultNamespace = getNamespaceIRI(itemScopeTypes.get(0));
116             for (IRI type : itemScopeTypes) {
117                 out.writeTriple(subject, RDF.TYPE, type);
118             }
119         }
120         for (Map.Entry<String, List<ItemProp>> itemProps : itemScope.getProperties().entrySet()) {
121             String propName = itemProps.getKey();
122             IRI predicate = getPredicate(defaultNamespace, propName);
123             if (predicate == null) {
124                 continue;
125             }
126             for (ItemProp itemProp : itemProps.getValue()) {
127                 try {
128                     processProperty(subject, predicate, itemProp, documentIRI, mappings, out, defaultNamespace);
129                 } catch (URISyntaxException e) {
130                     throw new ExtractionException(
131                             "Error while processing on subject '" + subject + "' the itemProp: '" + itemProp + "' ");
132                 }
133             }
134         }
135         return subject;
136     }
137 
138     private static Resource createSubjectForItemId(ParsedIRI documentIRI, String itemId) {
139         if (itemId == null) {
140             return RDFUtils.bnode();
141         }
142         try {
143             return toAbsoluteIRI(documentIRI, itemId);
144         } catch (URISyntaxException e) {
145             return RDFUtils.bnode();
146         }
147     }
148 
149     private void processProperty(Resource subject, IRI predicate, ItemProp itemProp, ParsedIRI documentIRI,
150             Map<ItemScope, Resource> mappings, ExtractionResult out, IRI defaultNamespace)
151             throws URISyntaxException, ExtractionException {
152 
153         Value value;
154         Object propValue = itemProp.getValue().getContent();
155         ItemPropValue.Type propType = itemProp.getValue().getType();
156         if (itemProp.getValue().literal != null) {
157             value = itemProp.getValue().literal;
158         } else if (propType.equals(ItemPropValue.Type.Nested)) {
159             value = processType((ItemScope) propValue, documentIRI, out, mappings, defaultNamespace);
160         } else if (propType.equals(ItemPropValue.Type.Link)) {
161             value = toAbsoluteIRI(documentIRI, (String) propValue);
162             // TODO: support registries so hardcoding not needed
163             if (predicate.stringValue().equals("http://schema.org/additionalType")) {
164                 if (itemProp.reverse) {
165                     out.writeTriple((Resource) value, RDF.TYPE, subject);
166                 } else {
167                     out.writeTriple(subject, RDF.TYPE, value);
168                 }
169             }
170         } else {
171             throw new RuntimeException(
172                     "Invalid Type '" + propType + "' for ItemPropValue with name: '" + predicate + "'");
173         }
174         if (itemProp.reverse) {
175             out.writeTriple((Resource) value, predicate, subject);
176         } else {
177             out.writeTriple(subject, predicate, value);
178         }
179     }
180 
181     private static final String hcardPrefix = "http://microformats.org/profile/hcard";
182     private static final IRI hcardNamespaceIRI = RDFUtils.iri("http://microformats.org/profile/hcard#");
183 
184     private static IRI getNamespaceIRI(IRI itemType) {
185         // TODO: support registries so hardcoding not needed
186         return itemType.stringValue().startsWith(hcardPrefix) ? hcardNamespaceIRI : itemType;
187     }
188 
189     private static IRI getPredicate(IRI namespaceIRI, String localName) {
190         return toAbsoluteIRI(localName).orElseGet(
191                 () -> namespaceIRI == null ? null : RDFUtils.iri(namespaceIRI.getNamespace(), localName.trim()));
192     }
193 
194     private static Optional<IRI> toAbsoluteIRI(String urlString) {
195         if (urlString != null) {
196             try {
197                 ParsedIRI iri = ParsedIRI.create(urlString.trim());
198                 if (iri.isAbsolute()) {
199                     return Optional.of(RDFUtils.iri(iri.toString()));
200                 }
201             } catch (RuntimeException e) {
202                 // not an absolute iri
203             }
204         }
205         return Optional.empty();
206     }
207 
208     private static IRI toAbsoluteIRI(ParsedIRI documentIRI, String part) throws URISyntaxException {
209         try {
210             return RDFUtils.iri(documentIRI.resolve(part.trim()));
211         } catch (RuntimeException e) {
212             if (e.getCause() instanceof URISyntaxException) {
213                 throw (URISyntaxException) e.getCause();
214             } else {
215                 throw new URISyntaxException(String.valueOf(part),
216                         e.getClass().getName() + (e.getMessage() != null ? ": " + e.getMessage() : ""));
217             }
218         }
219     }
220 
221     private void notifyError(MicrodataParserException[] errors, ExtractionResult out) {
222         for (MicrodataParserException mpe : errors) {
223             out.notifyIssue(IssueReport.IssueLevel.ERROR, mpe.toJSON(), mpe.getErrorLocationBeginRow(),
224                     mpe.getErrorLocationBeginCol());
225         }
226     }
227 
228 }