View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.microdata;
19  
20  import org.apache.any23.extractor.IssueReport;
21  import org.apache.any23.extractor.ExtractionContext;
22  import org.apache.any23.extractor.ExtractionException;
23  import org.apache.any23.extractor.ExtractionParameters;
24  import org.apache.any23.extractor.ExtractionResult;
25  import org.apache.any23.extractor.Extractor;
26  import org.apache.any23.extractor.ExtractorDescription;
27  import org.apache.any23.extractor.ExtractorFactory;
28  import org.apache.any23.extractor.SimpleExtractorFactory;
29  import org.apache.any23.extractor.html.DomUtils;
30  import org.apache.any23.rdf.PopularPrefixes;
31  import org.apache.any23.rdf.RDFUtils;
32  import org.apache.any23.vocab.DCTERMS;
33  import org.apache.any23.vocab.XHTML;
34  import org.openrdf.model.Literal;
35  import org.openrdf.model.Resource;
36  import org.openrdf.model.URI;
37  import org.openrdf.model.Value;
38  import org.openrdf.model.vocabulary.RDF;
39  import org.openrdf.model.vocabulary.XMLSchema;
40  import org.w3c.dom.Document;
41  import org.w3c.dom.Node;
42  import org.w3c.dom.NodeList;
43  
44  import java.io.IOException;
45  import java.net.MalformedURLException;
46  import java.net.URL;
47  import java.util.Arrays;
48  import java.util.Date;
49  import java.util.HashMap;
50  import java.util.HashSet;
51  import java.util.List;
52  import java.util.Map;
53  import java.util.Set;
54  
55  /**
56   * Default implementation of <a href="http://www.w3.org/TR/microdata/">Microdata</a> extractor,
57   * based on {@link org.apache.any23.extractor.Extractor.TagSoupDOMExtractor}.
58   *
59   * @author Michele Mostarda (mostarda@fbk.eu)
60   * @author Davide Palmisano ( dpalmisano@gmail.com )
61   */
62  public class MicrodataExtractor implements Extractor.TagSoupDOMExtractor {
63  
64      private static final URI MICRODATA_ITEM
65              = RDFUtils.uri("http://www.w3.org/1999/xhtml/microdata#item");
66  
67      public final static ExtractorFactory<MicrodataExtractor> factory =
68              SimpleExtractorFactory.create(
69                      "html-microdata",
70                      PopularPrefixes.createSubset("rdf", "doac", "foaf"),
71                      Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
72                      "example-microdata.html",
73                      MicrodataExtractor.class
74              );
75  
76      private String documentLanguage;
77  
78      private boolean isStrict;
79  
80      private String defaultNamespace;
81  
82      public ExtractorDescription getDescription() {
83          return factory;
84      }
85  
86      /**
87       * This extraction performs the
88       * <a href="http://www.w3.org/TR/microdata/#rdf">Microdata to RDF conversion algorithm</a>.
89       * A slight modification of the specification algorithm has been introduced
90       * to avoid performing actions 5.2.1, 5.2.2, 5.2.3, 5.2.4 if step 5.2.6 doesn't detect any
91       * Microdata.
92       */
93      public void run(
94              ExtractionParameters extractionParameters,
95              ExtractionContext extractionContext,
96              Document in,
97              ExtractionResult out
98      ) throws IOException, ExtractionException {
99  
100         final MicrodataParserReport parserReport = MicrodataParser.getMicrodata(in);
101         if(parserReport.getErrors().length > 0) {
102             notifyError(parserReport.getErrors(), out);
103         }
104         final ItemScope[] itemScopes = parserReport.getDetectedItemScopes();
105         if (itemScopes.length == 0) {
106             return;
107         }
108 
109         isStrict = extractionParameters.getFlag("any23.microdata.strict");
110         if (!isStrict) {
111             defaultNamespace = extractionParameters.getProperty("any23.microdata.ns.default");
112         }
113 
114         documentLanguage = getDocumentLanguage(in);
115 
116         /**
117          * 5.2.6
118          */
119         final URI documentURI = extractionContext.getDocumentURI();
120         final Map<ItemScope, Resource> mappings = new HashMap<ItemScope, Resource>();
121         for (ItemScope itemScope : itemScopes) {
122             Resource subject = processType(itemScope, documentURI, out, mappings);
123             out.writeTriple(
124                     documentURI,
125                     MICRODATA_ITEM,
126                     subject
127             );
128         }
129 
130         /**
131          * 5.2.1
132          */
133         processTitle(in, documentURI, out);
134         /**
135          * 5.2.2
136          */
137         processHREFElements(in, documentURI, out);
138         /**
139          * 5.2.3
140          */
141         processMetaElements(in, documentURI, out);
142 
143         /**
144          * 5.2.4
145          */
146         processCiteElements(in, documentURI, out);
147     }
148 
149     /**
150      * Returns the {@link Document} language if declared, <code>null</code> otherwise.
151      *
152      * @param in a instance of {@link Document}.
153      * @return the language declared, could be <code>null</code>.
154      */
155     private String getDocumentLanguage(Document in) {
156         String lang = DomUtils.find(in, "string(/HTML/@lang)");
157         if (lang.equals("")) {
158             return null;
159         }
160         return lang;
161     }
162 
163     /**
164      * Returns the {@link Node} language if declared, or the {@link Document} one
165      * if not defined.
166      *
167      * @param node a {@link Node} instance.
168      * @return the {@link Node} language or the {@link Document} one. Could be <code>null</code>
169      */
170     private String getLanguage(Node node) {
171         Node nodeLang = node.getAttributes().getNamedItem("lang");
172         if (nodeLang == null) {
173             // if the element does not specify a lang, use the document one
174             return documentLanguage;
175         }
176         return nodeLang.getTextContent();
177     }
178 
179     /**
180      * Implements step 5.2.1 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
181      * extraction algorithm.
182      *
183      * @param in          {@link Document} to be processed.
184      * @param documentURI Document current {@link URI}.
185      * @param out         a valid not <code>null</code> {@link ExtractionResult}
186      */
187     private void processTitle(Document in, URI documentURI, ExtractionResult out) {
188         NodeList titles = in.getElementsByTagName("title");
189         // just one title is allowed.
190         if (titles.getLength() == 1) {
191             Node title = titles.item(0);
192             String titleValue = title.getTextContent();
193             Literal object;
194             String lang = getLanguage(title);
195             if (lang == null) {
196                 // unable to decide the language, leave it unknown
197                 object = RDFUtils.literal(titleValue);
198             } else {
199                 object = RDFUtils.literal(titleValue, lang);
200             }
201             out.writeTriple(
202                     documentURI,
203                     DCTERMS.getInstance().title,
204                     object
205             );
206         }
207     }
208 
209     /**
210      * Implements step 5.2.2 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
211      * extraction algorithm.
212      *
213      * @param in          {@link Document} to be processed.
214      * @param documentURI Document current {@link URI}.
215      * @param out         a valid not <code>null</code> {@link ExtractionResult}
216      */
217     private void processHREFElements(Document in, URI documentURI, ExtractionResult out) {
218         NodeList anchors = in.getElementsByTagName("a");
219         for (int i = 0; i < anchors.getLength(); i++) {
220             processHREFElement(anchors.item(i), documentURI, out);
221         }
222         NodeList areas = in.getElementsByTagName("area");
223         for (int i = 0; i < areas.getLength(); i++) {
224             processHREFElement(areas.item(i), documentURI, out);
225         }
226         NodeList links = in.getElementsByTagName("link");
227         for (int i = 0; i < links.getLength(); i++) {
228             processHREFElement(links.item(i), documentURI, out);
229         }
230     }
231 
232     /**
233      * Implements sub-step for 5.2.3 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
234      * extraction algorithm.
235      *
236      * @param item        {@link Node} to be processed.
237      * @param documentURI Document current {@link URI}.
238      * @param out         a valid not <code>null</code> {@link ExtractionResult}
239      */
240     private void processHREFElement(Node item, URI documentURI, ExtractionResult out) {
241         Node rel = item.getAttributes().getNamedItem("rel");
242         if (rel == null) {
243             return;
244         }
245         Node href = item.getAttributes().getNamedItem("href");
246         if (href == null) {
247             return;
248         }
249         URL absoluteURL;
250         if (!isAbsoluteURL(href.getTextContent())) {
251             try {
252                 absoluteURL = toAbsoluteURL(
253                         documentURI.toString(),
254                         href.getTextContent(),
255                         '/'
256                 );
257             } catch (MalformedURLException e) {
258                 // okay, it's not an absolute URL, return
259                 return;
260             }
261         } else {
262             try {
263                 absoluteURL = new URL(href.getTextContent());
264             } catch (MalformedURLException e) {
265                 // cannot happen
266                 return;
267             }
268         }
269         String[] relTokens = rel.getTextContent().split(" ");
270         Set<String> tokensWithNoDuplicates = new HashSet<String>();
271         for (String relToken : relTokens) {
272             if (relToken.contains(":")) {
273                 // if contain semi-colon, skip
274                 continue;
275             }
276             if (relToken.equals("alternate") || relToken.equals("stylesheet")) {
277                 tokensWithNoDuplicates.add("ALTERNATE-STYLESHEET");
278                 continue;
279             }
280             tokensWithNoDuplicates.add(relToken.toLowerCase());
281         }
282         for (String token : tokensWithNoDuplicates) {
283             URI predicate;
284             if (isAbsoluteURL(token)) {
285                 predicate = RDFUtils.uri(token);
286             } else {
287                 predicate = RDFUtils.uri(XHTML.NS + token);
288             }
289             out.writeTriple(
290                     documentURI,
291                     predicate,
292                     RDFUtils.uri(absoluteURL.toString())
293             );
294         }
295     }
296 
297     /**
298      * Implements step 5.2.3 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
299      * extraction algorithm.
300      *
301      * @param in          {@link Document} to be processed.
302      * @param documentURI Document current {@link URI}.
303      * @param out         a valid not <code>null</code> {@link ExtractionResult}
304      */
305     private void processMetaElements(Document in, URI documentURI, ExtractionResult out) {
306         NodeList metas = in.getElementsByTagName("meta");
307         for (int i = 0; i < metas.getLength(); i++) {
308             Node meta = metas.item(i);
309             String name    = DomUtils.readAttribute(meta, "name"   , null);
310             String content = DomUtils.readAttribute(meta, "content", null);
311             if (name != null && content != null) {
312                 if (isAbsoluteURL(name)) {
313                     processMetaElement(
314                             RDFUtils.uri(name),
315                             content,
316                             getLanguage(meta),
317                             documentURI,
318                             out
319                     );
320                 } else {
321                     processMetaElement(
322                             name,
323                             content,
324                             getLanguage(meta),
325                             documentURI,
326                             out
327                     );
328                 }
329             }
330         }
331     }
332 
333     /**
334      * Implements sub step for 5.2.3 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
335      * extraction algorithm.
336      *
337      * @param uri
338      * @param content
339      * @param language
340      * @param documentURI
341      * @param out
342      */
343     private void processMetaElement(
344             URI uri,
345             String content,
346             String language,
347             URI documentURI,
348             ExtractionResult out
349     ) {
350         if (content.contains(":")) {
351             // if it contains U+003A COLON, exit
352             return;
353         }
354         Literal subject;
355         if (language == null) {
356             // ok, we don't know the language
357             subject = RDFUtils.literal(content);
358         } else {
359             subject = RDFUtils.literal(content, language);
360         }
361         out.writeTriple(
362                 documentURI,
363                 uri,
364                 subject
365         );
366     }
367 
368     /**
369      * Implements sub step for 5.2.3 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
370      * extraction algorithm.
371      *
372      * @param name
373      * @param content
374      * @param language
375      * @param documentURI
376      * @param out
377      */
378     private void processMetaElement(
379             String name,
380             String content,
381             String language,
382             URI documentURI,
383             ExtractionResult out) {
384         Literal subject;
385         if (language == null) {
386             // ok, we don't know the language
387             subject = RDFUtils.literal(content);
388         } else {
389             subject = RDFUtils.literal(content, language);
390         }
391         out.writeTriple(
392                 documentURI,
393                 RDFUtils.uri(XHTML.NS + name.toLowerCase()),
394                 subject
395         );
396     }
397 
398     /**
399      * Implements sub step for 5.2.4 of <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
400      * extraction algorithm.
401      *
402      * @param in
403      * @param documentURI
404      * @param out
405      */
406     private void processCiteElements(Document in, URI documentURI, ExtractionResult out) {
407         NodeList blockQuotes = in.getElementsByTagName("blockquote");
408         for (int i = 0; i < blockQuotes.getLength(); i++) {
409             processCiteElement(blockQuotes.item(i), documentURI, out);
410         }
411         NodeList quotes = in.getElementsByTagName("q");
412         for (int i = 0; i < quotes.getLength(); i++) {
413             processCiteElement(quotes.item(i), documentURI, out);
414         }
415     }
416 
417     private void processCiteElement(Node item, URI documentURI, ExtractionResult out) {
418         if (item.getAttributes().getNamedItem("cite") != null) {
419             out.writeTriple(
420                     documentURI,
421                     DCTERMS.getInstance().source,
422                     RDFUtils.uri(item.getAttributes().getNamedItem("cite").getTextContent())
423             );
424         }
425     }
426 
427     /**
428      * Recursive method implementing 5.2.6.1 "generate the triple for the item" of
429      * <a href="http://dev.w3.org/html5/md/Overview.html#rdf">Microdata to RDF</a>
430      * extraction algorithm.
431      *
432      * @param itemScope
433      * @param documentURI
434      * @param out
435      * @param mappings
436      * @return
437      * @throws ExtractionException
438      */
439     private Resource processType(
440             ItemScope itemScope,
441             URI documentURI, ExtractionResult out,
442             Map<ItemScope, Resource> mappings
443     ) throws ExtractionException {
444         Resource subject;
445         if (mappings.containsKey(itemScope)) {
446             subject = mappings.get(itemScope);
447         } else if (isAbsoluteURL(itemScope.getItemId())) {
448             subject = RDFUtils.uri(itemScope.getItemId());
449         } else {
450             subject = RDFUtils.getBNode(Integer.toString(itemScope.hashCode()));
451         }
452         mappings.put(itemScope, subject);
453 
454         // ItemScope.type could be null, but surely it's a valid URL
455         String itemScopeType = "";
456         if (itemScope.getType() != null) {
457             String itemType;
458             itemType = itemScope.getType().toString();
459             out.writeTriple(subject, RDF.TYPE, RDFUtils.uri(itemType));
460             itemScopeType = itemScope.getType().toString();
461         }
462         for (String propName : itemScope.getProperties().keySet()) {
463             List<ItemProp> itemProps = itemScope.getProperties().get(propName);
464             for (ItemProp itemProp : itemProps) {
465                 try {
466                     processProperty(
467                             subject,
468                             propName,
469                             itemProp,
470                             itemScopeType,
471                             documentURI,
472                             mappings,
473                             out
474                     );
475                 } catch (MalformedURLException e) {
476                     throw new ExtractionException(
477                             "Error while processing on subject '" + subject +
478                                     "' the itemProp: '" + itemProp + "' "
479                     );
480                 }
481             }
482         }
483         return subject;
484     }
485 
486     private void processProperty(
487             Resource subject,
488             String propName,
489             ItemProp itemProp,
490             String itemScopeType,
491             URI documentURI,
492             Map<ItemScope, Resource> mappings,
493             ExtractionResult out
494     ) throws MalformedURLException, ExtractionException {
495         URI predicate;
496         if (!isAbsoluteURL(propName) && itemScopeType.equals("") && isStrict) {
497             return;
498         } else if (!isAbsoluteURL(propName) && itemScopeType.equals("") && !isStrict) {
499             predicate = RDFUtils.uri(
500                     toAbsoluteURL(
501                             defaultNamespace,
502                             propName,
503                             '/'
504                     ).toString()
505             );
506         } else {
507             predicate = RDFUtils.uri(
508                     toAbsoluteURL(
509                             itemScopeType,
510                             propName,
511                             '/'
512                     ).toString());
513         }
514         Value value;
515         Object propValue = itemProp.getValue().getContent();
516         ItemPropValue.Type propType = itemProp.getValue().getType();
517         if (propType.equals(ItemPropValue.Type.Nested)) {
518             value = processType((ItemScope) propValue, documentURI, out, mappings);
519         } else if (propType.equals(ItemPropValue.Type.Plain)) {
520             value = RDFUtils.literal((String) propValue, documentLanguage);
521         } else if (propType.equals(ItemPropValue.Type.Link)) {
522             value = RDFUtils.uri(
523                     toAbsoluteURL(
524                             documentURI.toString(),
525                             (String) propValue,
526                             '/'
527                     ).toString()
528             );
529         } else if (propType.equals(ItemPropValue.Type.Date)) {
530             value = RDFUtils.literal(ItemPropValue.formatDateTime((Date) propValue), XMLSchema.DATE);
531         } else {
532             throw new RuntimeException("Invalid Type '" +
533                     propType + "' for ItemPropValue with name: '" + propName + "'");
534         }
535         out.writeTriple(subject, predicate, value);
536     }
537 
538     private boolean isAbsoluteURL(String urlString) {
539         boolean result = false;
540         try {
541             URL url = new URL(urlString);
542             String protocol = url.getProtocol();
543             if (protocol != null && protocol.trim().length() > 0)
544                 result = true;
545         } catch (MalformedURLException e) {
546             return false;
547         }
548         return result;
549     }
550 
551     private URL toAbsoluteURL(String ns, String part, char trailing)
552             throws MalformedURLException {
553         if (isAbsoluteURL(part)) {
554             return new URL(part);
555         }
556         char lastChar = ns.charAt(ns.length() - 1);
557         if (lastChar == '#' || lastChar == '/')
558             return new URL(ns + part);
559         return new URL(ns + trailing + part);
560     }
561 
562     private void notifyError(MicrodataParserException[] errors, ExtractionResult out) {
563         for(MicrodataParserException mpe : errors) {
564             out.notifyIssue(
565                     IssueReport.IssueLevel.Error,
566                     mpe.toJSON(),
567                     mpe.getErrorLocationBeginRow(),
568                     mpe.getErrorLocationBeginCol()
569             );
570         }
571     }
572 
573 }