View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.microdata;
19  
20  import org.apache.any23.extractor.ExtractionContext;
21  import org.apache.any23.extractor.ExtractionException;
22  import org.apache.any23.extractor.ExtractionParameters;
23  import org.apache.any23.extractor.ExtractionResult;
24  import org.apache.any23.extractor.Extractor;
25  import org.apache.any23.extractor.ExtractorDescription;
26  import org.apache.any23.extractor.IssueReport;
27  import org.apache.any23.rdf.RDFUtils;
28  import org.eclipse.rdf4j.common.net.ParsedIRI;
29  import org.eclipse.rdf4j.model.IRI;
30  import org.eclipse.rdf4j.model.Resource;
31  import org.eclipse.rdf4j.model.Value;
32  import org.eclipse.rdf4j.model.vocabulary.RDF;
33  import org.w3c.dom.Document;
34  
35  import java.io.IOException;
36  import java.net.URISyntaxException;
37  import java.util.HashMap;
38  import java.util.List;
39  import java.util.Map;
40  import java.util.Optional;
41  
42  /**
43   * Default implementation of <a href="https://www.w3.org/TR/microdata/">Microdata</a> extractor,
44   * based on {@link org.apache.any23.extractor.Extractor.TagSoupDOMExtractor}.
45   *
46   * @author Michele Mostarda (mostarda@fbk.eu)
47   * @author Davide Palmisano ( dpalmisano@gmail.com )
48   * @author Hans Brende (hansbrende@apache.org)
49   */
50  public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
51  
52      static final IRI MICRODATA_ITEM
53              = RDFUtils.iri("http://www.w3.org/1999/xhtml/microdata#item");
54  
55      private static final ParsedIRI EMPTY_FRAG = ParsedIRI.create("#");
56  
57      @Override
58      public ExtractorDescription getDescription() {
59          return MicrodataExtractorFactory.getDescriptionInstance();
60      }
61  
62      /**
63       * This extraction performs the
64       * <a href="https://www.w3.org/TR/microdata-rdf/">Microdata to RDF conversion algorithm</a>.
65       */
66      @Override
67      public void run(
68              ExtractionParameters extractionParameters,
69              ExtractionContext extractionContext,
70              Document in,
71              ExtractionResult out
72      ) throws IOException, ExtractionException {
73  
74          final MicrodataParserReport parserReport = MicrodataParser.getMicrodata(in);
75          if(parserReport.getErrors().length > 0) {
76              notifyError(parserReport.getErrors(), out);
77          }
78          final ItemScope[] itemScopes = parserReport.getDetectedItemScopes();
79          if (itemScopes.length == 0) {
80              return;
81          }
82  
83          final IRI documentIRI = extractionContext.getDocumentIRI();
84          final ParsedIRI parsedDocumentIRI = ParsedIRI.create(documentIRI.stringValue());
85  
86          boolean isStrict = extractionParameters.getFlag("any23.microdata.strict");
87          final IRI defaultNamespace;
88          if (!isStrict) {
89              defaultNamespace = RDFUtils.iri(extractionParameters.getProperty("any23.microdata.ns.default"));
90              if (!defaultNamespace.getLocalName().isEmpty()) {
91                  throw new IllegalArgumentException("invalid namespace IRI: " + defaultNamespace);
92              }
93          } else {
94              //TODO: incorporate document's "base" element
95              defaultNamespace = RDFUtils.iri(parsedDocumentIRI.resolve(EMPTY_FRAG).toString());
96          }
97  
98          //https://www.w3.org/TR/microdata-rdf/#generate-the-triples
99          final Map<ItemScope, Resource> mappings = new HashMap<>();
100         for (ItemScope itemScope : itemScopes) {
101             Resource subject = processType(itemScope, parsedDocumentIRI, out, mappings, defaultNamespace);
102 
103             //Writing out md:item triple has been removed from spec
104             //but for now, keep for backwards compatibility.
105             out.writeTriple(
106                     documentIRI,
107                     MICRODATA_ITEM,
108                     subject
109             );
110         }
111     }
112 
113     /**
114      * Recursive method implementing 6.3 "generate the triples" of the
115      * <a href="https://www.w3.org/TR/microdata-rdf/#generate-the-triples">Microdata to RDF</a>
116      * extraction algorithm.
117      */
118     private Resource processType(
119             ItemScope itemScope,
120             ParsedIRI documentIRI, ExtractionResult out,
121             Map<ItemScope, Resource> mappings, IRI defaultNamespace
122     ) throws ExtractionException {
123         Resource subject = mappings.computeIfAbsent(itemScope, scope ->
124                 createSubjectForItemId(documentIRI, scope.getItemId()));
125 
126         List<IRI> itemScopeTypes = itemScope.getTypes();
127         if (!itemScopeTypes.isEmpty()) {
128             defaultNamespace = getNamespaceIRI(itemScopeTypes.get(0));
129             for (IRI type : itemScopeTypes) {
130                 out.writeTriple(subject, RDF.TYPE, type);
131             }
132         }
133         for (Map.Entry<String, List<ItemProp>> itemProps : itemScope.getProperties().entrySet()) {
134             String propName = itemProps.getKey();
135             IRI predicate = getPredicate(defaultNamespace, propName);
136             if (predicate == null) {
137                 continue;
138             }
139             for (ItemProp itemProp : itemProps.getValue()) {
140                 try {
141                     processProperty(
142                             subject,
143                             predicate,
144                             itemProp,
145                             documentIRI,
146                             mappings,
147                             out,
148                             defaultNamespace
149                     );
150                 } catch (URISyntaxException e) {
151                     throw new ExtractionException(
152                             "Error while processing on subject '" + subject +
153                                     "' the itemProp: '" + itemProp + "' "
154                     );
155                 }
156             }
157         }
158         return subject;
159     }
160 
161     private static Resource createSubjectForItemId(ParsedIRI documentIRI, String itemId) {
162         if (itemId == null) {
163             return RDFUtils.bnode();
164         }
165         try {
166             return toAbsoluteIRI(documentIRI, itemId);
167         } catch (URISyntaxException e) {
168             return RDFUtils.bnode();
169         }
170     }
171 
172     private void processProperty(
173             Resource subject,
174             IRI predicate,
175             ItemProp itemProp,
176             ParsedIRI documentIRI,
177             Map<ItemScope, Resource> mappings,
178             ExtractionResult out,
179             IRI defaultNamespace
180     ) throws URISyntaxException, ExtractionException {
181 
182         Value value;
183         Object propValue = itemProp.getValue().getContent();
184         ItemPropValue.Type propType = itemProp.getValue().getType();
185         if (itemProp.getValue().literal != null) {
186             value = itemProp.getValue().literal;
187         } else if (propType.equals(ItemPropValue.Type.Nested)) {
188             value = processType((ItemScope) propValue, documentIRI, out, mappings, defaultNamespace);
189         } else if (propType.equals(ItemPropValue.Type.Link)) {
190             value = toAbsoluteIRI(documentIRI, (String)propValue);
191             //TODO: support registries so hardcoding not needed
192             if (predicate.stringValue().equals("http://schema.org/additionalType")) {
193                 if (itemProp.reverse) {
194                     out.writeTriple((Resource)value, RDF.TYPE, subject);
195                 } else {
196                     out.writeTriple(subject, RDF.TYPE, value);
197                 }
198             }
199         } else {
200             throw new RuntimeException("Invalid Type '" +
201                     propType + "' for ItemPropValue with name: '" + predicate + "'");
202         }
203         if (itemProp.reverse) {
204             out.writeTriple((Resource)value, predicate, subject);
205         } else {
206             out.writeTriple(subject, predicate, value);
207         }
208     }
209 
210     private static final String hcardPrefix    = "http://microformats.org/profile/hcard";
211     private static final IRI hcardNamespaceIRI = RDFUtils.iri("http://microformats.org/profile/hcard#");
212 
213     private static IRI getNamespaceIRI(IRI itemType) {
214         //TODO: support registries so hardcoding not needed
215         return itemType.stringValue().startsWith(hcardPrefix) ? hcardNamespaceIRI : itemType;
216     }
217 
218     private static IRI getPredicate(IRI namespaceIRI, String localName) {
219         return toAbsoluteIRI(localName).orElseGet(() -> namespaceIRI == null ? null :
220                 RDFUtils.iri(namespaceIRI.getNamespace(), localName.trim()));
221     }
222 
223     private static Optional<IRI> toAbsoluteIRI(String urlString) {
224         if (urlString != null) {
225             try {
226                 ParsedIRI iri = ParsedIRI.create(urlString.trim());
227                 if (iri.isAbsolute()) {
228                     return Optional.of(RDFUtils.iri(iri.toString()));
229                 }
230             } catch (RuntimeException e) {
231                 //not an absolute iri
232             }
233         }
234         return Optional.empty();
235     }
236 
237     private static IRI toAbsoluteIRI(ParsedIRI documentIRI, String part) throws URISyntaxException {
238         try {
239             return RDFUtils.iri(documentIRI.resolve(part.trim()));
240         } catch (RuntimeException e) {
241             if (e.getCause() instanceof URISyntaxException) {
242                 throw (URISyntaxException)e.getCause();
243             } else {
244                 throw new URISyntaxException(String.valueOf(part), e.getClass().getName()
245                         + (e.getMessage() != null ? ": " + e.getMessage() : ""));
246             }
247         }
248     }
249 
250     private void notifyError(MicrodataParserException[] errors, ExtractionResult out) {
251         for(MicrodataParserException mpe : errors) {
252             out.notifyIssue(
253                     IssueReport.IssueLevel.ERROR,
254                     mpe.toJSON(),
255                     mpe.getErrorLocationBeginRow(),
256                     mpe.getErrorLocationBeginCol()
257             );
258         }
259     }
260 
261 }