/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.extractor.TagSoupExtractionResult;
24  import org.apache.any23.vocab.WO;
25  import org.eclipse.rdf4j.model.BNode;
26  import org.eclipse.rdf4j.model.Resource;
27  import org.eclipse.rdf4j.model.IRI;
28  import org.eclipse.rdf4j.model.vocabulary.RDF;
29  import org.w3c.dom.Node;
30  
31  /**
32   * Extractor able to extract the <a href="http://microformats.org/wiki/species">Species Microformat</a>.
33   * The data are represented using the
34   * <a href="http://www.bbc.co.uk/ontologies/wildlife/2010-02-22.shtml">BBC Wildlife Ontology</a>.
35   *
36   * @see org.apache.any23.vocab.WO
37   * @author Davide Palmisano (dpalmisano@gmail.com)
38   */
39  public class SpeciesExtractor extends EntityBasedMicroformatExtractor {
40  
41      private static final WO vWO = WO.getInstance();
42  
43      private static final String[] classes = {
44              "kingdom",
45              "phylum",
46              "order",
47              "family",
48              "genus",
49              "species",
50              "class",
51      };
52  
53      /**
54       * Returns the description of this extractor.
55       *
56       * @return a human readable description.
57       */
58      @Override
59      public ExtractorDescription getDescription() {
60          return SpeciesExtractorFactory.getDescriptionInstance();
61      }
62  
63      /**
64       * Returns the base class name for the extractor.
65       *
66       * @return a string containing the base of the extractor.
67       */
68      @Override
69      protected String getBaseClassName() {
70          return "biota";
71      }
72  
73      /**
74       * Resets the internal status of the extractor to prepare it to a new extraction section.
75       */
76      @Override
77      protected void resetExtractor() {
78          // empty
79      }
80  
81      /**
82       * Extracts an entity from a <i>DOM</i> node.
83       *
84       * @param node the DOM node.
85       * @param out  the extraction result collector.
86       * @return <code>true</code> if the extraction has produces something, <code>false</code> otherwise.
87       * @throws org.apache.any23.extractor.ExtractionException if there is an error during extraction
88       *
89       */
90      @Override
91      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
92          BNode biota = getBlankNodeFor(node);
93          conditionallyAddResourceProperty(biota, RDF.TYPE, vWO.species);
94  
95          final HTMLDocumentTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
96          addNames(fragment, biota);
97          addClasses(fragment, biota);
98  
99          final TagSoupExtractionResultorg/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
100         tser.addResourceRoot(
101                 DomUtils.getXPathListForNode(node),
102                 biota,
103                 this.getClass()
104         );
105 
106         return true;
107     }
108 
109     private void addNames(HTMLDocument doc, Resource biota) throws ExtractionException {
110         HTMLDocument.TextField binomial = doc.getSingularTextField("binomial");
111         conditionallyAddStringProperty(
112                 binomial.source(), biota, vWO.scientificName, binomial.value()
113         );
114         HTMLDocument.TextField vernacular = doc.getSingularTextField("vernacular");
115         conditionallyAddStringProperty(
116                 vernacular.source(), biota, vWO.speciesName, vernacular.value()
117         );
118     }
119 
120     private void addClassesName(HTMLDocument doc, Resource biota) throws ExtractionException {
121         for (String clazz : classes) {
122             HTMLDocument.TextField classTextField = doc.getSingularTextField(clazz);
123             conditionallyAddStringProperty(
124                     classTextField.source(), biota, resolvePropertyName(clazz), classTextField.value());
125         }
126     }
127 
128     private void addClasses(HTMLDocument doc, Resource biota) throws ExtractionException {
129         for(String clazz : classes) {
130             HTMLDocument.TextField classTextField = doc.getSingularUrlField(clazz);
131             if(classTextField.source() != null) {
132                 BNode classBNode = getBlankNodeFor(classTextField.source());
133                 addBNodeProperty(biota, vWO.getProperty(clazz), classBNode);
134                 conditionallyAddResourceProperty(classBNode, RDF.TYPE, resolveClassName(clazz));
135                 HTMLDocumentTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(classTextField.source());
136                 addClassesName(fragment, classBNode);
137             }
138         }
139     }
140 
141     private IRI resolvePropertyName(String clazz) {
142         return vWO.getProperty(
143                 String.format(
144                         "%sName",
145                         clazz
146                 )
147         );
148     }
149 
150     private IRI resolveClassName(String clazz) {
151         String upperCaseClass = clazz.substring(0, 1);
152         return vWO.getClass(
153                 String.format("%s%s",
154                         upperCaseClass.toUpperCase(),
155                         clazz.substring(1)
156                 )
157         );
158     }
159 }