/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17  
18  package org.apache.any23.extractor.html.microformats2;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.extractor.TagSoupExtractionResult;
24  import org.apache.any23.vocab.HResume;
25  import org.eclipse.rdf4j.model.BNode;
26  import org.eclipse.rdf4j.model.Resource;
27  import org.eclipse.rdf4j.model.vocabulary.RDF;
28  import org.w3c.dom.Node;
29  import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
30  import org.apache.any23.extractor.html.HTMLDocument;
31  import org.apache.any23.extractor.html.DomUtils;
32  import java.util.List;
33  
34  /**
35   * Extractor for the <a href="http://microformats.org/wiki/hresume">hResume</a>
36   * microformat.
37   *
38   * @author Nisala Nirmana
39   */
40  public class HResumeExtractor extends EntityBasedMicroformatExtractor {
41  
42      private static final HResume vResume = HResume.getInstance();
43  
44      private static final String[] resumeFields = {
45              "name",
46              "summary",
47              "contact",
48              "education",
49              "experience",
50              "skill",
51              "affiliation"
52      };
53  
54      @Override
55      public ExtractorDescription getDescription() {
56          return HResumeExtractorFactory.getDescriptionInstance();
57      }
58  
59      @Override
60      public String getBaseClassName() {
61          return Microformats2Prefixes.CLASS_PREFIX + "resume";
62      }
63  
64      @Override
65      protected void resetExtractor() {
66          // Empty.
67      }
68  
69      @Override
70      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
71          if (null == node) return false;
72          BNode person = getBlankNodeFor(node);
73          out.writeTriple(person, RDF.TYPE, vResume.Resume);
74          final HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
75  
76          addName(fragment, person);
77          addSummary(fragment, person);
78          addSkills(fragment, person);
79  
80          addExperiences(fragment, person);
81          addEducations(fragment, person);
82  
83          addAffiliations(fragment, person);
84          addContacts(fragment,person);
85  
86          final TagSoupExtractionResult../org/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
87          tser.addResourceRoot(
88                  DomUtils.getXPathListForNode(node),
89                  person,
90                  this.getClass()
91          );
92  
93          return true;
94      }
95  
96      private void addContacts(HTMLDocument doc, Resource entry) throws ExtractionException {
97          List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[2] +
98                  Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card");
99          if (nodes.isEmpty())
100             return;
101         HCardExtractorFactoryormats2/HCardExtractorFactory.html#HCardExtractorFactory">HCardExtractorFactory factory = new HCardExtractorFactory();
102         HCardExtractor extractor = factory.createExtractor();
103         for (Node node : nodes) {
104             BNode contact = valueFactory.createBNode();
105             addIRIProperty(contact, RDF.TYPE, vResume.contact);
106             extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), contact,
107                     getCurrentExtractionResult());
108         }
109     }
110 
111     private void addAffiliations(HTMLDocument doc, Resource entry) throws ExtractionException {
112         List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[6] +
113                 Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card");
114         if (nodes.isEmpty())
115             return;
116         HCardExtractorFactoryormats2/HCardExtractorFactory.html#HCardExtractorFactory">HCardExtractorFactory factory = new HCardExtractorFactory();
117         HCardExtractor extractor = factory.createExtractor();
118         for (Node node : nodes) {
119             BNode affiliation = valueFactory.createBNode();
120             addIRIProperty(affiliation, RDF.TYPE, vResume.affiliation);
121             extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), affiliation,
122                     getCurrentExtractionResult());
123         }
124     }
125 
126     private void addName(HTMLDocument doc, Resource person) {
127         HTMLDocument.TextField name = doc.getSingularTextField(
128                 Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[0]);
129         conditionallyAddStringProperty(
130                 name.source(),
131                 person,
132                 vResume.name,
133                 name.value()
134         );
135     }
136 
137     private void addSummary(HTMLDocument doc, Resource person) {
138         HTMLDocument.TextField summary = doc.getSingularTextField(
139                 Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[1]);
140         conditionallyAddStringProperty(
141                 summary.source(),
142                 person,
143                 vResume.summary,
144                 summary.value()
145         );
146     }
147 
148     private void addSkills(HTMLDocument doc, Resource person) {
149         final HTMLDocument.TextField[] skills = doc.getPluralTextField(
150                 Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[5]);
151         for (HTMLDocument.TextField skill : skills) {
152             conditionallyAddStringProperty(
153                     skill.source(),
154                     person,
155                     vResume.skill,
156                     skill.value()
157             );
158         }
159 
160     }
161 
162     private void addExperiences(HTMLDocument doc, Resource person) throws ExtractionException {
163         List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[4] +
164                 Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "event");
165         if (nodes.isEmpty())
166             return;
167         HEventExtractorFactoryrmats2/HEventExtractorFactory.html#HEventExtractorFactory">HEventExtractorFactory factory = new HEventExtractorFactory();
168         HEventExtractor extractor = factory.createExtractor();
169         for (Node node : nodes) {
170             BNode event = valueFactory.createBNode();
171             addIRIProperty(event, RDF.TYPE, vResume.experience);
172             extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), event,
173                     getCurrentExtractionResult());
174         }
175     }
176 
177     private void addEducations(HTMLDocument doc, Resource person) throws ExtractionException {
178         List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + resumeFields[3] +
179                 Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "event");
180         if (nodes.isEmpty())
181             return;
182         HEventExtractorFactoryrmats2/HEventExtractorFactory.html#HEventExtractorFactory">HEventExtractorFactory factory = new HEventExtractorFactory();
183         HEventExtractor extractor = factory.createExtractor();
184         for (Node node : nodes) {
185             BNode event = valueFactory.createBNode();
186             addIRIProperty(event, RDF.TYPE, vResume.education);
187             extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), event,
188                     getCurrentExtractionResult());
189         }
190     }
191 }