/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17  
18  package org.apache.any23.extractor.html.microformats2;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.extractor.TagSoupExtractionResult;
24  import org.apache.any23.vocab.HItem;
25  import org.eclipse.rdf4j.model.BNode;
26  import org.eclipse.rdf4j.model.IRI;
27  import org.eclipse.rdf4j.model.vocabulary.RDF;
28  import org.w3c.dom.Node;
29  import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
30  import org.apache.any23.extractor.html.HTMLDocument;
31  
32  /**
33   * Extractor for the <a href="http://microformats.org/wiki/h-item">h-item</a>
34   * microformat.
35   *
36   * @author Nisala Nirmana
37   */
38  public class HItemExtractor extends EntityBasedMicroformatExtractor {
39  
40      private static final HItem vHITEM = HItem.getInstance();
41  
42      private static final String[] itemFields = {
43              "name",
44              "url",
45              "photo"
46      };
47  
48      @Override
49      public ExtractorDescription getDescription() {
50          return HItemExtractorFactory.getDescriptionInstance();
51      }
52  
53      protected String getBaseClassName() {
54          return Microformats2Prefixes.CLASS_PREFIX+"item";
55      }
56  
57      @Override
58      protected void resetExtractor() {
59          // Empty.
60      }
61  
62      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException{
63          if (null == node) return false;
64          final HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument document = new HTMLDocument(node);
65          BNode item = getBlankNodeFor(node);
66          out.writeTriple(item, RDF.TYPE, vHITEM.Item);
67          final String extractorName = getDescription().getExtractorName();
68          addName(document,item);
69          addPhotos(document,item);
70          addUrls(document,item);
71          final TagSoupExtractionResult../org/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult();
72          tser.addResourceRoot(document.getPathToLocalRoot(), item, this.getClass());
73          return true;
74      }
75  
76      private void mapFieldWithProperty(HTMLDocument fragment, BNode item, String fieldClass, IRI property) {
77          HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass);
78          conditionallyAddStringProperty(
79                  title.source(),item, property, title.value()
80          );
81      }
82  
83      private void addName(HTMLDocument fragment, BNode item) {
84          mapFieldWithProperty(fragment, item, Microformats2Prefixes.PROPERTY_PREFIX+itemFields[0], vHITEM.name);
85      }
86  
87      private void addPhotos(HTMLDocument fragment, BNode item) throws ExtractionException {
88          final HTMLDocument.TextField[] photos = fragment.getPluralUrlField
89                  (Microformats2Prefixes.URL_PROPERTY_PREFIX+itemFields[2]);
90          for(HTMLDocument.TextField photo : photos) {
91              addIRIProperty(item, vHITEM.photo, fragment.resolveIRI(photo.value()));
92          }
93      }
94  
95      private void addUrls(HTMLDocument fragment, BNode item) throws ExtractionException {
96          HTMLDocument.TextField[] links = fragment.getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX+
97                  itemFields[1]);
98          for (HTMLDocument.TextField link : links) {
99              conditionallyAddResourceProperty(item, vHITEM.url, getHTMLDocument().resolveIRI(link.value()));
100         }
101     }
102 }