View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionResult;
21  import org.apache.any23.extractor.ExtractorDescription;
22  import org.apache.any23.extractor.ExtractorFactory;
23  import org.apache.any23.extractor.SimpleExtractorFactory;
24  import org.apache.any23.extractor.TagSoupExtractionResult;
25  import org.apache.any23.rdf.PopularPrefixes;
26  import org.apache.any23.vocab.DOAC;
27  import org.apache.any23.vocab.FOAF;
28  import org.openrdf.model.BNode;
29  import org.openrdf.model.Resource;
30  import org.openrdf.model.vocabulary.RDF;
31  import org.w3c.dom.Node;
32  
33  import java.util.Arrays;
34  import java.util.List;
35  
36  /**
37   * Extractor for the <a href="http://microformats.org/wiki/hresume">hResume</a>
38   * microformat.
39   *
40   * @author Gabriele Renzi
41   */
42  public class HResumeExtractor extends EntityBasedMicroformatExtractor {
43  
44      private static final FOAF vFOAF = FOAF.getInstance();
45      private static final DOAC vDOAC = DOAC.getInstance();
46  
47      public final static ExtractorFactory<HResumeExtractor> factory =
48              SimpleExtractorFactory.create(
49                      "html-mf-hresume",
50                      PopularPrefixes.createSubset("rdf", "doac", "foaf"),
51                      Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
52                      "example-mf-hresume.html",
53                      HResumeExtractor.class
54              );
55  
56      public ExtractorDescription getDescription() {
57          return factory;
58      }
59  
60      public String getBaseClassName() {
61          return "hresume";
62      }
63  
64      @Override
65      protected void resetExtractor() {
66          // Empty.
67      }
68  
69      @Override
70      protected boolean extractEntity(Node node, ExtractionResult out) {
71          if (null == node) return false;
72          BNode person = getBlankNodeFor(node);
73          // we have a person, at least
74          out.writeTriple(person, RDF.TYPE, vFOAF.Person);
75          final HTMLDocument fragment = new HTMLDocument(node);
76          addSummary(fragment, person);
77          addContact(fragment, person);
78          addExperiences(fragment, person);
79          addEducations(fragment, person);
80          addAffiliations(fragment, person);
81          addSkills(fragment, person);
82  
83          final TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
84          tser.addResourceRoot(
85                  DomUtils.getXPathListForNode(node),
86                  person,
87                  this.getClass()
88          );
89  
90          return true;
91      }
92  
93      private void addSummary(HTMLDocument doc, Resource person) {
94          HTMLDocument.TextField summary = doc.getSingularTextField("summary");
95          conditionallyAddStringProperty(
96                  summary.source(),
97                  person,
98                  vDOAC.summary,
99                  summary.value()
100         );
101     }
102 
103     private void addContact(HTMLDocument doc, Resource person) {
104         List<Node> nodes = doc.findAllByClassName("contact");
105         if (nodes.size() > 0)
106             addBNodeProperty(
107                     nodes.get(0),
108                     person, vFOAF.isPrimaryTopicOf, getBlankNodeFor(nodes.get(0))
109             );
110     }
111 
112     private void addExperiences(HTMLDocument doc, Resource person) {
113         List<Node> nodes = doc.findAllByClassName("experience");
114         for (Node node : nodes) {
115             BNode exp = valueFactory.createBNode();
116             if (addExperience(exp, new HTMLDocument(node)))
117             addBNodeProperty(
118                     node,
119                     person, vDOAC.experience, exp
120             );
121         }
122     }
123 
124     private boolean addExperience(Resource exp, HTMLDocument document) {
125         final Node documentNode    = document.getDocument();
126         String check = "";
127 
128         HTMLDocument.TextField value = document.getSingularTextField("title");
129         check += value;
130         conditionallyAddStringProperty(value.source(), exp, vDOAC.title, value.value().trim());
131 
132         value = document.getSingularTextField("dtstart");
133         check += value;
134         conditionallyAddStringProperty(documentNode, exp, vDOAC.start_date, value.value().trim());
135 
136         value = document.getSingularTextField("dtend");
137         check += value;
138         conditionallyAddStringProperty(documentNode, exp, vDOAC.end_date, value.value().trim());
139 
140         value = document.getSingularTextField("summary");
141         check += value;
142         conditionallyAddStringProperty(documentNode, exp, vDOAC.organization, value.value().trim());
143 
144         return !"".equals(check);
145     }
146 
147     private void addEducations(HTMLDocument doc, Resource person) {
148         List<Node> nodes = doc.findAllByClassName("education");
149         for (Node node : nodes) {
150             BNode exp = valueFactory.createBNode();
151             if (addExperience(exp, new HTMLDocument(node)))
152             addBNodeProperty(
153                     node,
154                     person, vDOAC.education, exp
155             );
156         }
157     }
158 
159     private void addAffiliations(HTMLDocument doc, Resource person) {
160         List<Node> nodes = doc.findAllByClassName("affiliation");
161         for (Node node : nodes) {
162             addBNodeProperty(
163                     node,
164                     person, vDOAC.affiliation, getBlankNodeFor(node)
165             );
166         }
167     }
168 
169     private void addSkills(HTMLDocument doc, Resource person) {
170         List<Node> nodes;
171 
172         // Extracting data from single node.
173         nodes = doc.findAllByClassName("skill");
174         for (Node node : nodes) {
175             conditionallyAddStringProperty(
176                     node,
177                     person, vDOAC.skill, extractSkillValue(node)
178             );
179         }
180         // Extracting from enlisting node.
181         nodes = doc.findAllByClassName("skills");
182         for(Node node : nodes) {
183             String nodeText = node.getTextContent();
184             String[] skills = nodeText.split(",");
185             for(String skill : skills) {
186                 conditionallyAddStringProperty(
187                         node,
188                         person, vDOAC.skill, skill.trim()
189                 );
190             }
191         }
192     }
193 
194     private String extractSkillValue(Node n) {
195         String name = n.getNodeName();
196         String skill = null;
197         if ("A".equals(name) && DomUtils.hasAttribute(n, "rel", "tag")) {
198             skill = n.getAttributes().getNamedItem("href").getTextContent();
199         } else {
200             skill = n.getTextContent();
201         }
202         return skill;
203     }
204 
205 }