View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionResult;
21  import org.apache.any23.extractor.ExtractorDescription;
22  import org.apache.any23.extractor.TagSoupExtractionResult;
23  import org.apache.any23.vocab.DOAC;
24  import org.apache.any23.vocab.FOAF;
25  import org.eclipse.rdf4j.model.BNode;
26  import org.eclipse.rdf4j.model.Resource;
27  import org.eclipse.rdf4j.model.vocabulary.RDF;
28  import org.w3c.dom.Node;
29  
30  import java.util.List;
31  
32  /**
33   * Extractor for the <a href="http://microformats.org/wiki/hresume">hResume</a> microformat.
34   *
35   * @author Gabriele Renzi
36   */
37  public class HResumeExtractor extends EntityBasedMicroformatExtractor {
38  
39      private static final FOAF vFOAF = FOAF.getInstance();
40      private static final DOAC vDOAC = DOAC.getInstance();
41  
42      @Override
43      public ExtractorDescription getDescription() {
44          return HResumeExtractorFactory.getDescriptionInstance();
45      }
46  
47      @Override
48      public String getBaseClassName() {
49          return "hresume";
50      }
51  
52      @Override
53      protected void resetExtractor() {
54          // Empty.
55      }
56  
57      @Override
58      protected boolean extractEntity(Node node, ExtractionResult out) {
59          if (null == node)
60              return false;
61          BNode person = getBlankNodeFor(node);
62          // we have a person, at least
63          out.writeTriple(person, RDF.TYPE, vFOAF.Person);
64          final HTMLDocumentTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
65          addSummary(fragment, person);
66          addContact(fragment, person);
67          addExperiences(fragment, person);
68          addEducations(fragment, person);
69          addAffiliations(fragment, person);
70          addSkills(fragment, person);
71  
72          final TagSoupExtractionResultorg/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
73          tser.addResourceRoot(DomUtils.getXPathListForNode(node), person, this.getClass());
74  
75          return true;
76      }
77  
78      private void addSummary(HTMLDocument doc, Resource person) {
79          HTMLDocument.TextField summary = doc.getSingularTextField("summary");
80          conditionallyAddStringProperty(summary.source(), person, vDOAC.summary, summary.value());
81      }
82  
83      private void addContact(HTMLDocument doc, Resource person) {
84          List<Node> nodes = doc.findAllByClassName("contact");
85          if (nodes.size() > 0)
86              addBNodeProperty(nodes.get(0), person, vFOAF.isPrimaryTopicOf, getBlankNodeFor(nodes.get(0)));
87      }
88  
89      private void addExperiences(HTMLDocument doc, Resource person) {
90          List<Node> nodes = doc.findAllByClassName("experience");
91          for (Node node : nodes) {
92              BNode exp = valueFactory.createBNode();
93              if (addExperience(exp, new HTMLDocument(node)))
94                  addBNodeProperty(node, person, vDOAC.experience, exp);
95          }
96      }
97  
98      private boolean addExperience(Resource exp, HTMLDocument document) {
99          final Node documentNode = document.getDocument();
100         String check = "";
101 
102         HTMLDocument.TextField value = document.getSingularTextField("title");
103         check += value;
104         conditionallyAddStringProperty(value.source(), exp, vDOAC.title, value.value().trim());
105 
106         value = document.getSingularTextField("dtstart");
107         check += value;
108         conditionallyAddStringProperty(documentNode, exp, vDOAC.start_date, value.value().trim());
109 
110         value = document.getSingularTextField("dtend");
111         check += value;
112         conditionallyAddStringProperty(documentNode, exp, vDOAC.end_date, value.value().trim());
113 
114         value = document.getSingularTextField("summary");
115         check += value;
116         conditionallyAddStringProperty(documentNode, exp, vDOAC.organization, value.value().trim());
117 
118         return !"".equals(check);
119     }
120 
121     private void addEducations(HTMLDocument doc, Resource person) {
122         List<Node> nodes = doc.findAllByClassName("education");
123         for (Node node : nodes) {
124             BNode exp = valueFactory.createBNode();
125             if (addExperience(exp, new HTMLDocument(node)))
126                 addBNodeProperty(node, person, vDOAC.education, exp);
127         }
128     }
129 
130     private void addAffiliations(HTMLDocument doc, Resource person) {
131         List<Node> nodes = doc.findAllByClassName("affiliation");
132         for (Node node : nodes) {
133             addBNodeProperty(node, person, vDOAC.affiliation, getBlankNodeFor(node));
134         }
135     }
136 
137     private void addSkills(HTMLDocument doc, Resource person) {
138         List<Node> nodes;
139 
140         // Extracting data from single node.
141         nodes = doc.findAllByClassName("skill");
142         for (Node node : nodes) {
143             conditionallyAddStringProperty(node, person, vDOAC.skill, extractSkillValue(node));
144         }
145         // Extracting from enlisting node.
146         nodes = doc.findAllByClassName("skills");
147         for (Node node : nodes) {
148             String nodeText = node.getTextContent();
149             String[] skills = nodeText.split(",");
150             for (String skill : skills) {
151                 conditionallyAddStringProperty(node, person, vDOAC.skill, skill.trim());
152             }
153         }
154     }
155 
156     private String extractSkillValue(Node n) {
157         String name = n.getNodeName();
158         String skill = null;
159         if ("A".equals(name) && DomUtils.hasAttribute(n, "rel", "tag")) {
160             skill = n.getAttributes().getNamedItem("href").getTextContent();
161         } else {
162             skill = n.getTextContent();
163         }
164         return skill;
165     }
166 
167 }