View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.extractor.ExtractorFactory;
24  import org.apache.any23.extractor.SimpleExtractorFactory;
25  import org.apache.any23.extractor.TagSoupExtractionResult;
26  import org.apache.any23.rdf.PopularPrefixes;
27  import org.apache.any23.vocab.WO;
28  import org.openrdf.model.BNode;
29  import org.openrdf.model.Resource;
30  import org.openrdf.model.URI;
31  import org.openrdf.model.vocabulary.RDF;
32  import org.w3c.dom.Node;
33  
34  import java.util.Arrays;
35  
36  /**
37   * Extractor able to extract the <a href="http://microformats.org/wiki/species">Species Microformat</a>.
38   * The data are represented using the
39   * <a href="http://www.bbc.co.uk/ontologies/wildlife/2010-02-22.shtml">BBC Wildlife Ontology</a>.
40   *
41   * @see org.apache.any23.vocab.WO
42   * @author Davide Palmisano (dpalmisano@gmail.com)
43   */
44  public class SpeciesExtractor extends EntityBasedMicroformatExtractor {
45  
46      private static final WO vWO = WO.getInstance();
47  
48      private static final String[] classes = {
49              "kingdom",
50              "division",
51              "phylum",
52              "order",
53              "family",
54              "genus",
55              "species",
56              "class",
57      };
58  
59      public final static ExtractorFactory<SpeciesExtractor> factory =
60              SimpleExtractorFactory.create(
61                      "html-mf-species",
62                      PopularPrefixes.createSubset("rdf", "wo"),
63                      Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
64                      "example-mf-species.html",
65                      SpeciesExtractor.class
66              );
67  
68      /**
69       * Returns the description of this extractor.
70       *
71       * @return a human readable description.
72       */
73      @Override
74      public ExtractorDescription getDescription() {
75          return factory;
76      }
77  
78      /**
79       * Returns the base class name for the extractor.
80       *
81       * @return a string containing the base of the extractor.
82       */
83      @Override
84      protected String getBaseClassName() {
85          return "biota";
86      }
87  
88      /**
89       * Resets the internal status of the extractor to prepare it to a new extraction section.
90       */
91      @Override
92      protected void resetExtractor() {
93          // empty
94      }
95  
96      /**
97       * Extracts an entity from a <i>DOM</i> node.
98       *
99       * @param node the DOM node.
100      * @param out  the extraction result collector.
101      * @return <code>true</code> if the extraction has produces something, <code>false</code> otherwise.
102      * @throws org.apache.any23.extractor.ExtractionException
103      *
104      */
105     @Override
106     protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
107         BNode biota = getBlankNodeFor(node);
108         conditionallyAddResourceProperty(biota, RDF.TYPE, vWO.species);
109 
110         final HTMLDocument fragment = new HTMLDocument(node);
111         addNames(fragment, biota);
112         addClasses(fragment, biota);
113 
114         final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
115         tser.addResourceRoot(
116                 DomUtils.getXPathListForNode(node),
117                 biota,
118                 this.getClass()
119         );
120 
121         return true;
122     }
123 
124     private void addNames(HTMLDocument doc, Resource biota) throws ExtractionException {
125         HTMLDocument.TextField binomial = doc.getSingularTextField("binomial");
126         conditionallyAddStringProperty(
127                 binomial.source(), biota, vWO.scientificName, binomial.value()
128         );
129         HTMLDocument.TextField vernacular = doc.getSingularTextField("vernacular");
130         conditionallyAddStringProperty(
131                 vernacular.source(), biota, vWO.speciesName, vernacular.value()
132         );
133     }
134 
135     private void addClassesName(HTMLDocument doc, Resource biota) throws ExtractionException {
136         for (String clazz : classes) {
137             HTMLDocument.TextField classTextField = doc.getSingularTextField(clazz);
138             conditionallyAddStringProperty(
139                     classTextField.source(), biota, resolvePropertyName(clazz), classTextField.value());
140         }
141     }
142 
143     private void addClasses(HTMLDocument doc, Resource biota) throws ExtractionException {
144         for(String clazz : classes) {
145             HTMLDocument.TextField classTextField = doc.getSingularUrlField(clazz);
146             if(classTextField.source() != null) {
147                 BNode classBNode = getBlankNodeFor(classTextField.source());
148                 addBNodeProperty(biota, vWO.getProperty(clazz), classBNode);
149                 conditionallyAddResourceProperty(classBNode, RDF.TYPE, resolveClassName(clazz));
150                 HTMLDocument fragment = new HTMLDocument(classTextField.source());
151                 addClassesName(fragment, classBNode);
152             }
153         }
154     }
155 
156     private URI resolvePropertyName(String clazz) {
157         return vWO.getProperty(
158                 String.format(
159                         "%sName",
160                         clazz
161                 )
162         );
163     }
164 
165     private URI resolveClassName(String clazz) {
166         String upperCaseClass = clazz.substring(0, 1);
167         return vWO.getClass(
168                 String.format("%s%s",
169                         upperCaseClass.toUpperCase(),
170                         clazz.substring(1)
171                 )
172         );
173     }
174 }