View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html.microformats2;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.extractor.TagSoupExtractionResult;
24  import org.apache.any23.vocab.HResume;
25  import org.eclipse.rdf4j.model.BNode;
26  import org.eclipse.rdf4j.model.Resource;
27  import org.eclipse.rdf4j.model.vocabulary.RDF;
28  import org.w3c.dom.Node;
29  import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
30  import org.apache.any23.extractor.html.HTMLDocument;
31  import org.apache.any23.extractor.html.DomUtils;
32  import java.util.List;
33  
34  /**
35   * Extractor for the <a href="http://microformats.org/wiki/hresume">hResume</a> microformat.
36   *
37   * @author Nisala Nirmana
38   */
39  public class HResumeExtractor extends EntityBasedMicroformatExtractor {
40  
41      private static final HResume vResume = HResume.getInstance();
42  
43      private static final String[] resumeFields = { "name", "summary", "contact", "education", "experience", "skill",
44              "affiliation" };
45  
46      @Override
47      public ExtractorDescription getDescription() {
48          return HResumeExtractorFactory.getDescriptionInstance();
49      }
50  
51      @Override
52      public String getBaseClassName() {
53          return Microformats2Prefixes.CLASS_PREFIX + "resume";
54      }
55  
56      @Override
57      protected void resetExtractor() {
58          // Empty.
59      }
60  
61      @Override
62      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
63          if (null == node)
64              return false;
65          BNode person = getBlankNodeFor(node);
66          out.writeTriple(person, RDF.TYPE, vResume.Resume);
67          final HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
68  
69          addName(fragment, person);
70          addSummary(fragment, person);
71          addSkills(fragment, person);
72  
73          addExperiences(fragment, person);
74          addEducations(fragment, person);
75  
76          addAffiliations(fragment, person);
77          addContacts(fragment, person);
78  
79          final TagSoupExtractionResult../org/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
80          tser.addResourceRoot(DomUtils.getXPathListForNode(node), person, this.getClass());
81  
82          return true;
83      }
84  
85      private void addContacts(HTMLDocument doc, Resource entry) throws ExtractionException {
86          List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[2]
87                  + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card");
88          if (nodes.isEmpty())
89              return;
90          HCardExtractorFactoryormats2/HCardExtractorFactory.html#HCardExtractorFactory">HCardExtractorFactory factory = new HCardExtractorFactory();
91          HCardExtractor extractor = factory.createExtractor();
92          for (Node node : nodes) {
93              BNode contact = valueFactory.createBNode();
94              addIRIProperty(contact, RDF.TYPE, vResume.contact);
95              extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), contact, getCurrentExtractionResult());
96          }
97      }
98  
99      private void addAffiliations(HTMLDocument doc, Resource entry) throws ExtractionException {
100         List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[6]
101                 + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card");
102         if (nodes.isEmpty())
103             return;
104         HCardExtractorFactoryormats2/HCardExtractorFactory.html#HCardExtractorFactory">HCardExtractorFactory factory = new HCardExtractorFactory();
105         HCardExtractor extractor = factory.createExtractor();
106         for (Node node : nodes) {
107             BNode affiliation = valueFactory.createBNode();
108             addIRIProperty(affiliation, RDF.TYPE, vResume.affiliation);
109             extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), affiliation,
110                     getCurrentExtractionResult());
111         }
112     }
113 
114     private void addName(HTMLDocument doc, Resource person) {
115         HTMLDocument.TextField name = doc.getSingularTextField(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[0]);
116         conditionallyAddStringProperty(name.source(), person, vResume.name, name.value());
117     }
118 
119     private void addSummary(HTMLDocument doc, Resource person) {
120         HTMLDocument.TextField summary = doc
121                 .getSingularTextField(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[1]);
122         conditionallyAddStringProperty(summary.source(), person, vResume.summary, summary.value());
123     }
124 
125     private void addSkills(HTMLDocument doc, Resource person) {
126         final HTMLDocument.TextField[] skills = doc
127                 .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[5]);
128         for (HTMLDocument.TextField skill : skills) {
129             conditionallyAddStringProperty(skill.source(), person, vResume.skill, skill.value());
130         }
131 
132     }
133 
134     private void addExperiences(HTMLDocument doc, Resource person) throws ExtractionException {
135         List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[4]
136                 + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "event");
137         if (nodes.isEmpty())
138             return;
139         HEventExtractorFactoryrmats2/HEventExtractorFactory.html#HEventExtractorFactory">HEventExtractorFactory factory = new HEventExtractorFactory();
140         HEventExtractor extractor = factory.createExtractor();
141         for (Node node : nodes) {
142             BNode event = valueFactory.createBNode();
143             addIRIProperty(event, RDF.TYPE, vResume.experience);
144             extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), event, getCurrentExtractionResult());
145         }
146     }
147 
148     private void addEducations(HTMLDocument doc, Resource person) throws ExtractionException {
149         List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[3]
150                 + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "event");
151         if (nodes.isEmpty())
152             return;
153         HEventExtractorFactoryrmats2/HEventExtractorFactory.html#HEventExtractorFactory">HEventExtractorFactory factory = new HEventExtractorFactory();
154         HEventExtractor extractor = factory.createExtractor();
155         for (Node node : nodes) {
156             BNode event = valueFactory.createBNode();
157             addIRIProperty(event, RDF.TYPE, vResume.education);
158             extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), event, getCurrentExtractionResult());
159         }
160     }
161 }