View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html.microformats2;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.extractor.TagSoupExtractionResult;
24  import org.apache.any23.extractor.html.HTMLDocument;
25  import org.apache.any23.vocab.HCard;
26  import org.eclipse.rdf4j.model.BNode;
27  import org.eclipse.rdf4j.model.Resource;
28  import org.eclipse.rdf4j.model.IRI;
29  import org.eclipse.rdf4j.model.vocabulary.RDF;
30  import org.w3c.dom.Node;
31  import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
32  import org.apache.any23.extractor.html.DomUtils;
33  
34  import java.util.List;
35  
36  /**
37   * Extractor for the <a href="http://microformats.org/wiki/hcard">h-Card</a> microformat.
38   *
39   * @author Nisala Nirmana
40   */
41  public class HCardExtractor extends EntityBasedMicroformatExtractor {
42  
43      private static final HCard vCARD = HCard.getInstance();
44  
45      private static final String[] cardFields = { "name", "honorific-prefix", "given-name", "additional-name",
46              "family-name", "sort-string", "honorific-suffix", "nickname", "email", "logo", "photo", "url", "uid",
47              "category", "tel", "note", "bday", "key", "org", "job-title", "role", "impp", "sex", "gender-identity",
48              "anniversary", "adr", "geo" };
49  
50      private static final String[] addressFields = { "street-address", "extended-address", "locality", "region",
51              "postal-code", "country-name", "geo" };
52  
53      private static final String[] geoFields = { "latitude", "longitude", "altitude" };
54  
55      @Override
56      public ExtractorDescription getDescription() {
57          return HCardExtractorFactory.getDescriptionInstance();
58      }
59  
60      @Override
61      protected String getBaseClassName() {
62          return Microformats2Prefixes.CLASS_PREFIX + "card";
63      }
64  
65      @Override
66      protected void resetExtractor() {
67          // empty
68      }
69  
70      @Override
71      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
72          final BNode card = getBlankNodeFor(node);
73          conditionallyAddResourceProperty(card, RDF.TYPE, vCARD.Card);
74          final HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
75          addName(fragment, card);
76          addHonorificPrefix(fragment, card);
77          addGivenName(fragment, card);
78          addAdditionalName(fragment, card);
79          addFamilyName(fragment, card);
80          addSortString(fragment, card);
81          addHonorificSuffix(fragment, card);
82          addNickname(fragment, card);
83          addEmails(fragment, card);
84          addLogo(fragment, card);
85          addPhoto(fragment, card);
86          addURLs(fragment, card);
87          addUID(fragment, card);
88          addCategories(fragment, card);
89          addTelephones(fragment, card);
90          addNotes(fragment, card);
91          addBday(fragment, card);
92          addKey(fragment, card);
93          addOrg(fragment, card);
94          addJobTitle(fragment, card);
95          addRole(fragment, card);
96          addImpp(fragment, card);
97          addSex(fragment, card);
98          addGenderIdentity(fragment, card);
99          addAnniversary(fragment, card);
100         addGeo(fragment, card);
101         addAdr(fragment, card);
102         final TagSoupExtractionResult../org/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
103         tser.addResourceRoot(DomUtils.getXPathListForNode(node), card, this.getClass());
104         return true;
105     }
106 
107     public Resource extractEntityAsEmbeddedProperty(HTMLDocument fragment, BNode card, ExtractionResult out)
108             throws ExtractionException {
109         this.setCurrentExtractionResult(out);
110         addName(fragment, card);
111         addHonorificPrefix(fragment, card);
112         addGivenName(fragment, card);
113         addAdditionalName(fragment, card);
114         addFamilyName(fragment, card);
115         addSortString(fragment, card);
116         addHonorificSuffix(fragment, card);
117         addNickname(fragment, card);
118         addEmails(fragment, card);
119         addLogo(fragment, card);
120         addPhoto(fragment, card);
121         addURLs(fragment, card);
122         addUID(fragment, card);
123         addCategories(fragment, card);
124         addTelephones(fragment, card);
125         addNotes(fragment, card);
126         addBday(fragment, card);
127         addKey(fragment, card);
128         addOrg(fragment, card);
129         addJobTitle(fragment, card);
130         addRole(fragment, card);
131         addImpp(fragment, card);
132         addSex(fragment, card);
133         addGenderIdentity(fragment, card);
134         addAnniversary(fragment, card);
135         addGeo(fragment, card);
136         addAdr(fragment, card);
137         return card;
138     }
139 
140     private void mapFieldWithProperty(HTMLDocument fragment, BNode card, String fieldClass, IRI property) {
141         HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass);
142         conditionallyAddStringProperty(title.source(), card, property, title.value());
143     }
144 
145     private void addName(HTMLDocument fragment, BNode card) {
146         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[0], vCARD.name);
147     }
148 
149     private void addHonorificPrefix(HTMLDocument fragment, BNode card) {
150         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[1],
151                 vCARD.honorific_prefix);
152     }
153 
154     private void addGivenName(HTMLDocument fragment, BNode card) {
155         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[2], vCARD.given_name);
156     }
157 
158     private void addAdditionalName(HTMLDocument fragment, BNode card) {
159         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[3],
160                 vCARD.additional_name);
161     }
162 
163     private void addFamilyName(HTMLDocument fragment, BNode card) {
164         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[4], vCARD.family_name);
165     }
166 
167     private void addSortString(HTMLDocument fragment, BNode card) {
168         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[5], vCARD.sort_string);
169     }
170 
171     private void addHonorificSuffix(HTMLDocument fragment, BNode card) {
172         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[6],
173                 vCARD.honorific_suffix);
174     }
175 
176     private void addNickname(HTMLDocument fragment, BNode card) {
177         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[7], vCARD.nickname);
178     }
179 
180     private void addEmails(HTMLDocument fragment, BNode card) throws ExtractionException {
181         final HTMLDocument.TextField[] emails = fragment
182                 .getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[8]);
183         for (HTMLDocument.TextField email : emails) {
184             addIRIProperty(card, vCARD.email, fragment.resolveIRI(email.value()));
185 
186         }
187     }
188 
189     private void addLogo(HTMLDocument fragment, BNode card) throws ExtractionException {
190         final HTMLDocument.TextField logo = fragment
191                 .getSingularUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[9]);
192         if (logo.source() == null)
193             return;
194         addIRIProperty(card, vCARD.logo, fragment.resolveIRI(logo.value()));
195     }
196 
197     private void addPhoto(HTMLDocument fragment, BNode card) throws ExtractionException {
198         final HTMLDocument.TextField photo = fragment
199                 .getSingularUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[10]);
200         if (photo.source() == null)
201             return;
202         addIRIProperty(card, vCARD.photo, fragment.resolveIRI(photo.value()));
203     }
204 
205     private void addURLs(HTMLDocument fragment, BNode card) throws ExtractionException {
206         final HTMLDocument.TextField[] urls = fragment
207                 .getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[11]);
208         for (HTMLDocument.TextField url : urls) {
209             addIRIProperty(card, vCARD.url, fragment.resolveIRI(url.value()));
210 
211         }
212     }
213 
214     private void addUID(HTMLDocument fragment, BNode card) throws ExtractionException {
215         final HTMLDocument.TextField uid = fragment
216                 .getSingularUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[12]);
217         if (uid.source() == null)
218             return;
219         addIRIProperty(card, vCARD.uid, fragment.resolveIRI(uid.value()));
220     }
221 
222     private void addCategories(HTMLDocument fragment, BNode entry) {
223         final HTMLDocument.TextField[] categories = fragment
224                 .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + cardFields[13]);
225         for (HTMLDocument.TextField category : categories) {
226             conditionallyAddStringProperty(category.source(), entry, vCARD.category, category.value());
227         }
228     }
229 
230     private void addTelephones(HTMLDocument fragment, BNode card) {
231         final HTMLDocument.TextField[] telephones = fragment
232                 .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + cardFields[14]);
233         for (HTMLDocument.TextField tel : telephones) {
234             Node attribute = tel.source().getAttributes().getNamedItem("value");
235             if (attribute == null) {
236                 conditionallyAddStringProperty(tel.source(), card, vCARD.tel, tel.value());
237             } else {
238                 conditionallyAddStringProperty(tel.source(), card, vCARD.tel, attribute.getNodeValue());
239             }
240         }
241     }
242 
243     private void addNotes(HTMLDocument fragment, BNode entry) {
244         final HTMLDocument.TextField[] categories = fragment
245                 .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + cardFields[15]);
246         for (HTMLDocument.TextField category : categories) {
247             conditionallyAddStringProperty(category.source(), entry, vCARD.note, category.value());
248         }
249     }
250 
251     private void addBday(HTMLDocument fragment, BNode card) {
252         final HTMLDocument.TextField bday = fragment
253                 .getSingularTextField(Microformats2Prefixes.TIME_PROPERTY_PREFIX + cardFields[16]);
254         if (bday.source() == null)
255             return;
256 
257         Node attribute = bday.source().getAttributes().getNamedItem("datetime");
258         if (attribute == null) {
259             conditionallyAddStringProperty(bday.source(), card, vCARD.bday, bday.value());
260         } else {
261             conditionallyAddStringProperty(bday.source(), card, vCARD.bday, attribute.getNodeValue());
262 
263         }
264     }
265 
266     private void addKey(HTMLDocument fragment, BNode card) throws ExtractionException {
267         final HTMLDocument.TextField uid = fragment
268                 .getSingularTextField(Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[17]);
269         if (uid.source() == null)
270             return;
271         addIRIProperty(card, vCARD.key, fragment.resolveIRI(uid.value()));
272     }
273 
274     private void addOrg(HTMLDocument fragment, BNode card) {
275         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[18], vCARD.org);
276     }
277 
278     private void addJobTitle(HTMLDocument fragment, BNode card) {
279         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[19], vCARD.job_title);
280     }
281 
282     private void addRole(HTMLDocument fragment, BNode card) {
283         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[20], vCARD.role);
284     }
285 
286     private void addImpp(HTMLDocument fragment, BNode card) throws ExtractionException {
287         final HTMLDocument.TextField impp = fragment
288                 .getSingularTextField(Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[21]);
289         if (impp.source() == null)
290             return;
291         addIRIProperty(card, vCARD.impp, fragment.resolveIRI(impp.value()));
292     }
293 
294     private void addSex(HTMLDocument fragment, BNode card) {
295         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[22], vCARD.sex);
296     }
297 
298     private void addGenderIdentity(HTMLDocument fragment, BNode card) {
299         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX + cardFields[23],
300                 vCARD.gender_identity);
301     }
302 
303     private void addAnniversary(HTMLDocument fragment, BNode card) {
304         final HTMLDocument.TextField anniversary = fragment
305                 .getSingularTextField(Microformats2Prefixes.TIME_PROPERTY_PREFIX + cardFields[24]);
306         if (anniversary.source() == null)
307             return;
308 
309         Node attribute = anniversary.source().getAttributes().getNamedItem("datetime");
310         if (attribute == null) {
311             conditionallyAddStringProperty(anniversary.source(), card, vCARD.bday, anniversary.value());
312         } else {
313             conditionallyAddStringProperty(anniversary.source(), card, vCARD.bday, attribute.getNodeValue());
314 
315         }
316     }
317 
318     private void addAdr(HTMLDocument doc, Resource card) throws ExtractionException {
319         List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + cardFields[25]
320                 + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + cardFields[25]);
321         if (nodes.isEmpty())
322             return;
323         for (Node node : nodes) {
324             BNode location = valueFactory.createBNode();
325             addIRIProperty(location, RDF.TYPE, vCARD.Address);
326             HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
327             for (String field : addressFields) {
328                 HTMLDocument.TextField[] values = fragment
329                         .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + field);
330                 for (HTMLDocument.TextField val : values) {
331                     if (!field.equals("geo")) {
332                         conditionallyAddStringProperty(val.source(), location, vCARD.getProperty(field), val.value());
333                     } else {
334                         addGeo(new HTMLDocument(node), card);
335                     }
336                 }
337             }
338         }
339     }
340 
341     private void addGeo(HTMLDocument doc, Resource card) throws ExtractionException {
342         List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + cardFields[26]
343                 + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + cardFields[26]);
344         if (nodes.isEmpty())
345             return;
346         for (Node node : nodes) {
347             BNode location = valueFactory.createBNode();
348             addIRIProperty(location, RDF.TYPE, vCARD.Geo);
349             HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
350             for (String field : geoFields) {
351                 HTMLDocument.TextField[] values = fragment
352                         .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + field);
353                 for (HTMLDocument.TextField val : values) {
354                     Node attribute = val.source().getAttributes().getNamedItem("title");
355                     if (attribute == null) {
356                         conditionallyAddStringProperty(val.source(), location, vCARD.getProperty(field), val.value());
357                     } else {
358                         conditionallyAddStringProperty(val.source(), location, vCARD.getProperty(field),
359                                 attribute.getNodeValue());
360                     }
361                 }
362             }
363         }
364     }
365 
366 }