1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionException;
21 import org.apache.any23.extractor.ExtractionResult;
22 import org.apache.any23.extractor.ExtractorDescription;
23 import org.apache.any23.extractor.ExtractorFactory;
24 import org.apache.any23.extractor.SimpleExtractorFactory;
25 import org.apache.any23.extractor.TagSoupExtractionResult;
26 import org.apache.any23.rdf.PopularPrefixes;
27 import org.apache.any23.vocab.WO;
28 import org.openrdf.model.BNode;
29 import org.openrdf.model.Resource;
30 import org.openrdf.model.URI;
31 import org.openrdf.model.vocabulary.RDF;
32 import org.w3c.dom.Node;
33
34 import java.util.Arrays;
35
36
37
38
39
40
41
42
43
44 public class SpeciesExtractor extends EntityBasedMicroformatExtractor {
45
46 private static final WO vWO = WO.getInstance();
47
48 private static final String[] classes = {
49 "kingdom",
50 "division",
51 "phylum",
52 "order",
53 "family",
54 "genus",
55 "species",
56 "class",
57 };
58
59 public final static ExtractorFactory<SpeciesExtractor> factory =
60 SimpleExtractorFactory.create(
61 "html-mf-species",
62 PopularPrefixes.createSubset("rdf", "wo"),
63 Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
64 "example-mf-species.html",
65 SpeciesExtractor.class
66 );
67
68
69
70
71
72
73 @Override
74 public ExtractorDescription getDescription() {
75 return factory;
76 }
77
78
79
80
81
82
83 @Override
84 protected String getBaseClassName() {
85 return "biota";
86 }
87
88
89
90
91 @Override
92 protected void resetExtractor() {
93
94 }
95
96
97
98
99
100
101
102
103
104
105 @Override
106 protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
107 BNode biota = getBlankNodeFor(node);
108 conditionallyAddResourceProperty(biota, RDF.TYPE, vWO.species);
109
110 final HTMLDocument fragment = new HTMLDocument(node);
111 addNames(fragment, biota);
112 addClasses(fragment, biota);
113
114 final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
115 tser.addResourceRoot(
116 DomUtils.getXPathListForNode(node),
117 biota,
118 this.getClass()
119 );
120
121 return true;
122 }
123
124 private void addNames(HTMLDocument doc, Resource biota) throws ExtractionException {
125 HTMLDocument.TextField binomial = doc.getSingularTextField("binomial");
126 conditionallyAddStringProperty(
127 binomial.source(), biota, vWO.scientificName, binomial.value()
128 );
129 HTMLDocument.TextField vernacular = doc.getSingularTextField("vernacular");
130 conditionallyAddStringProperty(
131 vernacular.source(), biota, vWO.speciesName, vernacular.value()
132 );
133 }
134
135 private void addClassesName(HTMLDocument doc, Resource biota) throws ExtractionException {
136 for (String clazz : classes) {
137 HTMLDocument.TextField classTextField = doc.getSingularTextField(clazz);
138 conditionallyAddStringProperty(
139 classTextField.source(), biota, resolvePropertyName(clazz), classTextField.value());
140 }
141 }
142
143 private void addClasses(HTMLDocument doc, Resource biota) throws ExtractionException {
144 for(String clazz : classes) {
145 HTMLDocument.TextField classTextField = doc.getSingularUrlField(clazz);
146 if(classTextField.source() != null) {
147 BNode classBNode = getBlankNodeFor(classTextField.source());
148 addBNodeProperty(biota, vWO.getProperty(clazz), classBNode);
149 conditionallyAddResourceProperty(classBNode, RDF.TYPE, resolveClassName(clazz));
150 HTMLDocument fragment = new HTMLDocument(classTextField.source());
151 addClassesName(fragment, classBNode);
152 }
153 }
154 }
155
156 private URI resolvePropertyName(String clazz) {
157 return vWO.getProperty(
158 String.format(
159 "%sName",
160 clazz
161 )
162 );
163 }
164
165 private URI resolveClassName(String clazz) {
166 String upperCaseClass = clazz.substring(0, 1);
167 return vWO.getClass(
168 String.format("%s%s",
169 upperCaseClass.toUpperCase(),
170 clazz.substring(1)
171 )
172 );
173 }
174 }