/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17  
18  package org.apache.any23.extractor.html;
19  
20  import java.util.Locale;
21  
22  import org.apache.any23.extractor.ExtractionException;
23  import org.apache.any23.extractor.ExtractionResult;
24  import org.apache.any23.extractor.ExtractorDescription;
25  import org.apache.any23.extractor.TagSoupExtractionResult;
26  import org.apache.any23.vocab.WO;
27  import org.eclipse.rdf4j.model.BNode;
28  import org.eclipse.rdf4j.model.Resource;
29  import org.eclipse.rdf4j.model.IRI;
30  import org.eclipse.rdf4j.model.vocabulary.RDF;
31  import org.w3c.dom.Node;
32  
33  /**
34   * Extractor able to extract the <a href="http://microformats.org/wiki/species">Species Microformat</a>. The data are
35   * represented using the <a href="http://www.bbc.co.uk/ontologies/wildlife/2010-02-22.shtml">BBC Wildlife Ontology</a>.
36   *
37   * @see org.apache.any23.vocab.WO
38   * 
39   * @author Davide Palmisano (dpalmisano@gmail.com)
40   */
41  public class SpeciesExtractor extends EntityBasedMicroformatExtractor {
42  
43      private static final WO vWO = WO.getInstance();
44  
45      private static final String[] classes = { "kingdom", "phylum", "order", "family", "genus", "species", "class", };
46  
47      /**
48       * Returns the description of this extractor.
49       *
50       * @return a human readable description.
51       */
52      @Override
53      public ExtractorDescription getDescription() {
54          return SpeciesExtractorFactory.getDescriptionInstance();
55      }
56  
57      /**
58       * Returns the base class name for the extractor.
59       *
60       * @return a string containing the base of the extractor.
61       */
62      @Override
63      protected String getBaseClassName() {
64          return "biota";
65      }
66  
67      /**
68       * Resets the internal status of the extractor to prepare it to a new extraction section.
69       */
70      @Override
71      protected void resetExtractor() {
72          // empty
73      }
74  
75      /**
76       * Extracts an entity from a <i>DOM</i> node.
77       *
78       * @param node
79       *            the DOM node.
80       * @param out
81       *            the extraction result collector.
82       * 
83       * @return <code>true</code> if the extraction has produces something, <code>false</code> otherwise.
84       * 
85       * @throws org.apache.any23.extractor.ExtractionException
86       *             if there is an error during extraction
87       *
88       */
89      @Override
90      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
91          BNode biota = getBlankNodeFor(node);
92          conditionallyAddResourceProperty(biota, RDF.TYPE, vWO.species);
93  
94          final HTMLDocumentTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
95          addNames(fragment, biota);
96          addClasses(fragment, biota);
97  
98          final TagSoupExtractionResultorg/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
99          tser.addResourceRoot(DomUtils.getXPathListForNode(node), biota, this.getClass());
100 
101         return true;
102     }
103 
104     private void addNames(HTMLDocument doc, Resource biota) throws ExtractionException {
105         HTMLDocument.TextField binomial = doc.getSingularTextField("binomial");
106         conditionallyAddStringProperty(binomial.source(), biota, vWO.scientificName, binomial.value());
107         HTMLDocument.TextField vernacular = doc.getSingularTextField("vernacular");
108         conditionallyAddStringProperty(vernacular.source(), biota, vWO.speciesName, vernacular.value());
109     }
110 
111     private void addClassesName(HTMLDocument doc, Resource biota) throws ExtractionException {
112         for (String clazz : classes) {
113             HTMLDocument.TextField classTextField = doc.getSingularTextField(clazz);
114             conditionallyAddStringProperty(classTextField.source(), biota, resolvePropertyName(clazz),
115                     classTextField.value());
116         }
117     }
118 
119     private void addClasses(HTMLDocument doc, Resource biota) throws ExtractionException {
120         for (String clazz : classes) {
121             HTMLDocument.TextField classTextField = doc.getSingularUrlField(clazz);
122             if (classTextField.source() != null) {
123                 BNode classBNode = getBlankNodeFor(classTextField.source());
124                 addBNodeProperty(biota, vWO.getProperty(clazz), classBNode);
125                 conditionallyAddResourceProperty(classBNode, RDF.TYPE, resolveClassName(clazz));
126                 HTMLDocumentTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(classTextField.source());
127                 addClassesName(fragment, classBNode);
128             }
129         }
130     }
131 
132     private IRI resolvePropertyName(String clazz) {
133         return vWO.getProperty(String.format(Locale.ROOT, "%sName", clazz));
134     }
135 
136     private IRI resolveClassName(String clazz) {
137         String upperCaseClass = clazz.substring(0, 1);
138         return vWO.getClass(
139                 String.format(Locale.ROOT, "%s%s", upperCaseClass.toUpperCase(Locale.ROOT), clazz.substring(1)));
140     }
141 }