View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionResult;
21  import org.apache.any23.extractor.ExtractorDescription;
22  import org.apache.any23.extractor.TagSoupExtractionResult;
23  import org.apache.any23.vocab.DOAC;
24  import org.apache.any23.vocab.FOAF;
25  import org.eclipse.rdf4j.model.BNode;
26  import org.eclipse.rdf4j.model.Resource;
27  import org.eclipse.rdf4j.model.vocabulary.RDF;
28  import org.w3c.dom.Node;
29  
30  import java.util.List;
31  
32  /**
33   * Extractor for the <a href="http://microformats.org/wiki/hresume">hResume</a>
34   * microformat.
35   *
36   * @author Gabriele Renzi
37   */
38  public class HResumeExtractor extends EntityBasedMicroformatExtractor {
39  
40      private static final FOAF vFOAF = FOAF.getInstance();
41      private static final DOAC vDOAC = DOAC.getInstance();
42  
43      @Override
44      public ExtractorDescription getDescription() {
45          return HResumeExtractorFactory.getDescriptionInstance();
46      }
47  
48      @Override
49      public String getBaseClassName() {
50          return "hresume";
51      }
52  
53      @Override
54      protected void resetExtractor() {
55          // Empty.
56      }
57  
58      @Override
59      protected boolean extractEntity(Node node, ExtractionResult out) {
60          if (null == node) return false;
61          BNode person = getBlankNodeFor(node);
62          // we have a person, at least
63          out.writeTriple(person, RDF.TYPE, vFOAF.Person);
64          final HTMLDocumentTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
65          addSummary(fragment, person);
66          addContact(fragment, person);
67          addExperiences(fragment, person);
68          addEducations(fragment, person);
69          addAffiliations(fragment, person);
70          addSkills(fragment, person);
71  
72          final TagSoupExtractionResultorg/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
73          tser.addResourceRoot(
74                  DomUtils.getXPathListForNode(node),
75                  person,
76                  this.getClass()
77          );
78  
79          return true;
80      }
81  
82      private void addSummary(HTMLDocument doc, Resource person) {
83          HTMLDocument.TextField summary = doc.getSingularTextField("summary");
84          conditionallyAddStringProperty(
85                  summary.source(),
86                  person,
87                  vDOAC.summary,
88                  summary.value()
89          );
90      }
91  
92      private void addContact(HTMLDocument doc, Resource person) {
93          List<Node> nodes = doc.findAllByClassName("contact");
94          if (nodes.size() > 0)
95              addBNodeProperty(
96                      nodes.get(0),
97                      person, vFOAF.isPrimaryTopicOf, getBlankNodeFor(nodes.get(0))
98              );
99      }
100 
101     private void addExperiences(HTMLDocument doc, Resource person) {
102         List<Node> nodes = doc.findAllByClassName("experience");
103         for (Node node : nodes) {
104             BNode exp = valueFactory.createBNode();
105             if (addExperience(exp, new HTMLDocument(node)))
106             addBNodeProperty(
107                     node,
108                     person, vDOAC.experience, exp
109             );
110         }
111     }
112 
113     private boolean addExperience(Resource exp, HTMLDocument document) {
114         final Node documentNode    = document.getDocument();
115         String check = "";
116 
117         HTMLDocument.TextField value = document.getSingularTextField("title");
118         check += value;
119         conditionallyAddStringProperty(value.source(), exp, vDOAC.title, value.value().trim());
120 
121         value = document.getSingularTextField("dtstart");
122         check += value;
123         conditionallyAddStringProperty(documentNode, exp, vDOAC.start_date, value.value().trim());
124 
125         value = document.getSingularTextField("dtend");
126         check += value;
127         conditionallyAddStringProperty(documentNode, exp, vDOAC.end_date, value.value().trim());
128 
129         value = document.getSingularTextField("summary");
130         check += value;
131         conditionallyAddStringProperty(documentNode, exp, vDOAC.organization, value.value().trim());
132 
133         return !"".equals(check);
134     }
135 
136     private void addEducations(HTMLDocument doc, Resource person) {
137         List<Node> nodes = doc.findAllByClassName("education");
138         for (Node node : nodes) {
139             BNode exp = valueFactory.createBNode();
140             if (addExperience(exp, new HTMLDocument(node)))
141             addBNodeProperty(
142                     node,
143                     person, vDOAC.education, exp
144             );
145         }
146     }
147 
148     private void addAffiliations(HTMLDocument doc, Resource person) {
149         List<Node> nodes = doc.findAllByClassName("affiliation");
150         for (Node node : nodes) {
151             addBNodeProperty(
152                     node,
153                     person, vDOAC.affiliation, getBlankNodeFor(node)
154             );
155         }
156     }
157 
158     private void addSkills(HTMLDocument doc, Resource person) {
159         List<Node> nodes;
160 
161         // Extracting data from single node.
162         nodes = doc.findAllByClassName("skill");
163         for (Node node : nodes) {
164             conditionallyAddStringProperty(
165                     node,
166                     person, vDOAC.skill, extractSkillValue(node)
167             );
168         }
169         // Extracting from enlisting node.
170         nodes = doc.findAllByClassName("skills");
171         for(Node node : nodes) {
172             String nodeText = node.getTextContent();
173             String[] skills = nodeText.split(",");
174             for(String skill : skills) {
175                 conditionallyAddStringProperty(
176                         node,
177                         person, vDOAC.skill, skill.trim()
178                 );
179             }
180         }
181     }
182 
183     private String extractSkillValue(Node n) {
184         String name = n.getNodeName();
185         String skill = null;
186         if ("A".equals(name) && DomUtils.hasAttribute(n, "rel", "tag")) {
187             skill = n.getAttributes().getNamedItem("href").getTextContent();
188         } else {
189             skill = n.getTextContent();
190         }
191         return skill;
192     }
193 
194 }