1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionResult;
21 import org.apache.any23.extractor.ExtractorDescription;
22 import org.apache.any23.extractor.ExtractorFactory;
23 import org.apache.any23.extractor.SimpleExtractorFactory;
24 import org.apache.any23.extractor.TagSoupExtractionResult;
25 import org.apache.any23.rdf.PopularPrefixes;
26 import org.apache.any23.vocab.DOAC;
27 import org.apache.any23.vocab.FOAF;
28 import org.openrdf.model.BNode;
29 import org.openrdf.model.Resource;
30 import org.openrdf.model.vocabulary.RDF;
31 import org.w3c.dom.Node;
32
33 import java.util.Arrays;
34 import java.util.List;
35
36
37
38
39
40
41
42 public class HResumeExtractor extends EntityBasedMicroformatExtractor {
43
44 private static final FOAF vFOAF = FOAF.getInstance();
45 private static final DOAC vDOAC = DOAC.getInstance();
46
47 public final static ExtractorFactory<HResumeExtractor> factory =
48 SimpleExtractorFactory.create(
49 "html-mf-hresume",
50 PopularPrefixes.createSubset("rdf", "doac", "foaf"),
51 Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
52 "example-mf-hresume.html",
53 HResumeExtractor.class
54 );
55
56 public ExtractorDescription getDescription() {
57 return factory;
58 }
59
60 public String getBaseClassName() {
61 return "hresume";
62 }
63
64 @Override
65 protected void resetExtractor() {
66
67 }
68
69 @Override
70 protected boolean extractEntity(Node node, ExtractionResult out) {
71 if (null == node) return false;
72 BNode person = getBlankNodeFor(node);
73
74 out.writeTriple(person, RDF.TYPE, vFOAF.Person);
75 final HTMLDocument fragment = new HTMLDocument(node);
76 addSummary(fragment, person);
77 addContact(fragment, person);
78 addExperiences(fragment, person);
79 addEducations(fragment, person);
80 addAffiliations(fragment, person);
81 addSkills(fragment, person);
82
83 final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
84 tser.addResourceRoot(
85 DomUtils.getXPathListForNode(node),
86 person,
87 this.getClass()
88 );
89
90 return true;
91 }
92
93 private void addSummary(HTMLDocument doc, Resource person) {
94 HTMLDocument.TextField summary = doc.getSingularTextField("summary");
95 conditionallyAddStringProperty(
96 summary.source(),
97 person,
98 vDOAC.summary,
99 summary.value()
100 );
101 }
102
103 private void addContact(HTMLDocument doc, Resource person) {
104 List<Node> nodes = doc.findAllByClassName("contact");
105 if (nodes.size() > 0)
106 addBNodeProperty(
107 nodes.get(0),
108 person, vFOAF.isPrimaryTopicOf, getBlankNodeFor(nodes.get(0))
109 );
110 }
111
112 private void addExperiences(HTMLDocument doc, Resource person) {
113 List<Node> nodes = doc.findAllByClassName("experience");
114 for (Node node : nodes) {
115 BNode exp = valueFactory.createBNode();
116 if (addExperience(exp, new HTMLDocument(node)))
117 addBNodeProperty(
118 node,
119 person, vDOAC.experience, exp
120 );
121 }
122 }
123
124 private boolean addExperience(Resource exp, HTMLDocument document) {
125 final Node documentNode = document.getDocument();
126 String check = "";
127
128 HTMLDocument.TextField value = document.getSingularTextField("title");
129 check += value;
130 conditionallyAddStringProperty(value.source(), exp, vDOAC.title, value.value().trim());
131
132 value = document.getSingularTextField("dtstart");
133 check += value;
134 conditionallyAddStringProperty(documentNode, exp, vDOAC.start_date, value.value().trim());
135
136 value = document.getSingularTextField("dtend");
137 check += value;
138 conditionallyAddStringProperty(documentNode, exp, vDOAC.end_date, value.value().trim());
139
140 value = document.getSingularTextField("summary");
141 check += value;
142 conditionallyAddStringProperty(documentNode, exp, vDOAC.organization, value.value().trim());
143
144 return !"".equals(check);
145 }
146
147 private void addEducations(HTMLDocument doc, Resource person) {
148 List<Node> nodes = doc.findAllByClassName("education");
149 for (Node node : nodes) {
150 BNode exp = valueFactory.createBNode();
151 if (addExperience(exp, new HTMLDocument(node)))
152 addBNodeProperty(
153 node,
154 person, vDOAC.education, exp
155 );
156 }
157 }
158
159 private void addAffiliations(HTMLDocument doc, Resource person) {
160 List<Node> nodes = doc.findAllByClassName("affiliation");
161 for (Node node : nodes) {
162 addBNodeProperty(
163 node,
164 person, vDOAC.affiliation, getBlankNodeFor(node)
165 );
166 }
167 }
168
169 private void addSkills(HTMLDocument doc, Resource person) {
170 List<Node> nodes;
171
172
173 nodes = doc.findAllByClassName("skill");
174 for (Node node : nodes) {
175 conditionallyAddStringProperty(
176 node,
177 person, vDOAC.skill, extractSkillValue(node)
178 );
179 }
180
181 nodes = doc.findAllByClassName("skills");
182 for(Node node : nodes) {
183 String nodeText = node.getTextContent();
184 String[] skills = nodeText.split(",");
185 for(String skill : skills) {
186 conditionallyAddStringProperty(
187 node,
188 person, vDOAC.skill, skill.trim()
189 );
190 }
191 }
192 }
193
194 private String extractSkillValue(Node n) {
195 String name = n.getNodeName();
196 String skill = null;
197 if ("A".equals(name) && DomUtils.hasAttribute(n, "rel", "tag")) {
198 skill = n.getAttributes().getNamedItem("href").getTextContent();
199 } else {
200 skill = n.getTextContent();
201 }
202 return skill;
203 }
204
205 }