View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html.microformats2;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.extractor.TagSoupExtractionResult;
24  import org.apache.any23.vocab.HItem;
25  import org.eclipse.rdf4j.model.BNode;
26  import org.eclipse.rdf4j.model.IRI;
27  import org.eclipse.rdf4j.model.vocabulary.RDF;
28  import org.w3c.dom.Node;
29  import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
30  import org.apache.any23.extractor.html.HTMLDocument;
31  
32  /**
33   * Extractor for the <a href="http://microformats.org/wiki/h-item">h-item</a> microformat.
34   *
35   * @author Nisala Nirmana
36   */
37  public class HItemExtractor extends EntityBasedMicroformatExtractor {
38  
39      private static final HItem vHITEM = HItem.getInstance();
40  
41      private static final String[] itemFields = { "name", "url", "photo" };
42  
43      @Override
44      public ExtractorDescription getDescription() {
45          return HItemExtractorFactory.getDescriptionInstance();
46      }
47  
48      protected String getBaseClassName() {
49          return Microformats2Prefixes.CLASS_PREFIX + "item";
50      }
51  
52      @Override
53      protected void resetExtractor() {
54          // Empty.
55      }
56  
57      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
58          if (null == node)
59              return false;
60          final HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument document = new HTMLDocument(node);
61          BNode item = getBlankNodeFor(node);
62          out.writeTriple(item, RDF.TYPE, vHITEM.Item);
63          final String extractorName = getDescription().getExtractorName();
64          addName(document, item);
65          addPhotos(document, item);
66          addUrls(document, item);
67          final TagSoupExtractionResult../org/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult();
68          tser.addResourceRoot(document.getPathToLocalRoot(), item, this.getClass());
69          return true;
70      }
71  
72      private void mapFieldWithProperty(HTMLDocument fragment, BNode item, String fieldClass, IRI property) {
73          HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass);
74          conditionallyAddStringProperty(title.source(), item, property, title.value());
75      }
76  
77      private void addName(HTMLDocument fragment, BNode item) {
78          mapFieldWithProperty(fragment, item, Microformats2Prefixes.PROPERTY_PREFIX + itemFields[0], vHITEM.name);
79      }
80  
81      private void addPhotos(HTMLDocument fragment, BNode item) throws ExtractionException {
82          final HTMLDocument.TextField[] photos = fragment
83                  .getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + itemFields[2]);
84          for (HTMLDocument.TextField photo : photos) {
85              addIRIProperty(item, vHITEM.photo, fragment.resolveIRI(photo.value()));
86          }
87      }
88  
89      private void addUrls(HTMLDocument fragment, BNode item) throws ExtractionException {
90          HTMLDocument.TextField[] links = fragment
91                  .getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + itemFields[1]);
92          for (HTMLDocument.TextField link : links) {
93              conditionallyAddResourceProperty(item, vHITEM.url, getHTMLDocument().resolveIRI(link.value()));
94          }
95      }
96  }