View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html.microformats2;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.extractor.TagSoupExtractionResult;
24  import org.apache.any23.extractor.html.HTMLDocument;
25  import org.apache.any23.vocab.HCard;
26  import org.eclipse.rdf4j.model.BNode;
27  import org.eclipse.rdf4j.model.Resource;
28  import org.eclipse.rdf4j.model.IRI;
29  import org.eclipse.rdf4j.model.vocabulary.RDF;
30  import org.w3c.dom.Node;
31  import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
32  import org.apache.any23.extractor.html.DomUtils;
33  
34  import java.util.List;
35  
36  
37  /**
38   * Extractor for the <a href="http://microformats.org/wiki/hcard">h-Card</a>
39   * microformat.
40   *
41   * @author Nisala Nirmana
42   */
43  public class HCardExtractor extends EntityBasedMicroformatExtractor {
44  
45      private static final HCard vCARD = HCard.getInstance();
46  
47      private static final String[] cardFields = {
48              "name",
49              "honorific-prefix",
50              "given-name",
51              "additional-name",
52              "family-name",
53              "sort-string",
54              "honorific-suffix",
55              "nickname",
56              "email",
57              "logo",
58              "photo",
59              "url",
60              "uid",
61              "category",
62              "tel",
63              "note",
64              "bday",
65              "key",
66              "org",
67              "job-title",
68              "role",
69              "impp",
70              "sex",
71              "gender-identity",
72              "anniversary",
73              "adr",
74              "geo"
75      };
76  
77      private static final String[] addressFields = {
78              "street-address",
79              "extended-address",
80              "locality",
81              "region",
82              "postal-code",
83              "country-name",
84              "geo"
85      };
86  
87      private static final String[] geoFields = {
88              "latitude",
89              "longitude",
90              "altitude"
91      };
92  
93  
94  
95      @Override
96      public ExtractorDescription getDescription() {
97          return HCardExtractorFactory.getDescriptionInstance();
98      }
99  
100     @Override
101     protected String getBaseClassName() {
102         return Microformats2Prefixes.CLASS_PREFIX+"card";
103     }
104 
105     @Override
106     protected void resetExtractor() {
107         //empty
108     }
109 
110     @Override
111     protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
112         final BNode card = getBlankNodeFor(node);
113         conditionallyAddResourceProperty(card, RDF.TYPE, vCARD.Card);
114         final HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
115         addName(fragment, card);
116         addHonorificPrefix(fragment, card);
117         addGivenName(fragment, card);
118         addAdditionalName(fragment, card);
119         addFamilyName(fragment, card);
120         addSortString(fragment, card);
121         addHonorificSuffix(fragment, card);
122         addNickname(fragment, card);
123         addEmails(fragment, card);
124         addLogo(fragment, card);
125         addPhoto(fragment, card);
126         addURLs(fragment, card);
127         addUID(fragment, card);
128         addCategories(fragment, card);
129         addTelephones(fragment, card);
130         addNotes(fragment, card);
131         addBday(fragment, card);
132         addKey(fragment, card);
133         addOrg(fragment, card);
134         addJobTitle(fragment, card);
135         addRole(fragment, card);
136         addImpp(fragment, card);
137         addSex(fragment, card);
138         addGenderIdentity(fragment, card);
139         addAnniversary(fragment, card);
140         addGeo(fragment, card);
141         addAdr(fragment, card);
142         final TagSoupExtractionResult../org/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) out;
143         tser.addResourceRoot( DomUtils.getXPathListForNode(node), card, this.getClass() );
144         return true;
145     }
146 
147     public Resource extractEntityAsEmbeddedProperty(HTMLDocument fragment, BNode card,
148                                                     ExtractionResult out)
149             throws ExtractionException {
150         this.setCurrentExtractionResult(out);
151         addName(fragment, card);
152         addHonorificPrefix(fragment, card);
153         addGivenName(fragment, card);
154         addAdditionalName(fragment, card);
155         addFamilyName(fragment, card);
156         addSortString(fragment, card);
157         addHonorificSuffix(fragment, card);
158         addNickname(fragment, card);
159         addEmails(fragment, card);
160         addLogo(fragment, card);
161         addPhoto(fragment, card);
162         addURLs(fragment, card);
163         addUID(fragment, card);
164         addCategories(fragment, card);
165         addTelephones(fragment, card);
166         addNotes(fragment, card);
167         addBday(fragment, card);
168         addKey(fragment, card);
169         addOrg(fragment, card);
170         addJobTitle(fragment, card);
171         addRole(fragment, card);
172         addImpp(fragment, card);
173         addSex(fragment, card);
174         addGenderIdentity(fragment, card);
175         addAnniversary(fragment, card);
176         addGeo(fragment, card);
177         addAdr(fragment, card);
178         return card;
179     }
180 
181 
182 
183     private void mapFieldWithProperty(HTMLDocument fragment, BNode card, String fieldClass,
184                                       IRI property) {
185         HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass);
186         conditionallyAddStringProperty(
187                 title.source(), card, property, title.value()
188         );
189     }
190 
191     private void addName(HTMLDocument fragment, BNode card) {
192         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX +
193                 cardFields[0], vCARD.name);
194     }
195 
196     private void addHonorificPrefix(HTMLDocument fragment, BNode card) {
197         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX +
198                 cardFields[1], vCARD.honorific_prefix);
199     }
200 
201     private void addGivenName(HTMLDocument fragment, BNode card) {
202         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX +
203                 cardFields[2], vCARD.given_name);
204     }
205 
206     private void addAdditionalName(HTMLDocument fragment, BNode card) {
207         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX +
208                 cardFields[3], vCARD.additional_name);
209     }
210 
211     private void addFamilyName(HTMLDocument fragment, BNode card) {
212         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX +
213                 cardFields[4], vCARD.family_name);
214     }
215 
216     private void addSortString(HTMLDocument fragment, BNode card) {
217         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX +
218                 cardFields[5], vCARD.sort_string);
219     }
220 
221     private void addHonorificSuffix(HTMLDocument fragment, BNode card) {
222         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX +
223                 cardFields[6], vCARD.honorific_suffix);
224     }
225 
226     private void addNickname(HTMLDocument fragment, BNode card) {
227         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX +
228                 cardFields[7], vCARD.nickname);
229     }
230 
231     private void addEmails(HTMLDocument fragment, BNode card) throws ExtractionException {
232         final HTMLDocument.TextField[] emails = fragment.getPluralUrlField
233                 (Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[8]);
234         for(HTMLDocument.TextField email : emails) {
235             addIRIProperty(card, vCARD.email, fragment.resolveIRI(email.value()));
236 
237         }
238     }
239 
240     private void addLogo(HTMLDocument fragment, BNode card) throws ExtractionException {
241         final HTMLDocument.TextField logo = fragment.getSingularUrlField
242                 (Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[9]);
243         if(logo.source()==null)
244             return;
245         addIRIProperty(card, vCARD.logo, fragment.resolveIRI(logo.value()));
246     }
247 
248     private void addPhoto(HTMLDocument fragment, BNode card) throws ExtractionException {
249         final HTMLDocument.TextField photo = fragment.getSingularUrlField
250                 (Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[10]);
251         if(photo.source()==null)
252             return;
253         addIRIProperty(card, vCARD.photo, fragment.resolveIRI(photo.value()));
254     }
255 
256     private void addURLs(HTMLDocument fragment, BNode card) throws ExtractionException {
257         final HTMLDocument.TextField[] urls = fragment.getPluralUrlField
258                 (Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[11]);
259         for(HTMLDocument.TextField url : urls) {
260             addIRIProperty(card, vCARD.url, fragment.resolveIRI(url.value()));
261 
262         }
263     }
264 
265     private void addUID(HTMLDocument fragment, BNode card) throws ExtractionException {
266         final HTMLDocument.TextField uid = fragment.getSingularUrlField
267                 (Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[12]);
268         if(uid.source()==null)
269             return;
270         addIRIProperty(card, vCARD.uid, fragment.resolveIRI(uid.value()));
271     }
272 
273 
274     private void addCategories(HTMLDocument fragment, BNode entry) {
275         final HTMLDocument.TextField[] categories = fragment.getPluralTextField
276                 (Microformats2Prefixes.PROPERTY_PREFIX + cardFields[13]);
277         for (HTMLDocument.TextField category : categories) {
278             conditionallyAddStringProperty(
279                     category.source(), entry, vCARD.category, category.value()
280             );
281         }
282     }
283 
284     private void addTelephones(HTMLDocument fragment, BNode card) {
285         final HTMLDocument.TextField[] telephones = fragment.getPluralTextField
286                 (Microformats2Prefixes.PROPERTY_PREFIX + cardFields[14]);
287         for (HTMLDocument.TextField tel : telephones) {
288             Node attribute=tel.source().getAttributes().getNamedItem("value");
289             if (attribute==null){
290                 conditionallyAddStringProperty(
291                         tel.source(), card, vCARD.tel, tel.value()
292                 );
293             }else{
294                 conditionallyAddStringProperty(
295                         tel.source(), card, vCARD.tel, attribute.getNodeValue()
296                 );
297             }
298         }
299     }
300 
301     private void addNotes(HTMLDocument fragment, BNode entry) {
302         final HTMLDocument.TextField[] categories = fragment.getPluralTextField
303                 (Microformats2Prefixes.PROPERTY_PREFIX + cardFields[15]);
304         for (HTMLDocument.TextField category : categories) {
305             conditionallyAddStringProperty(
306                     category.source(), entry, vCARD.note, category.value()
307             );
308         }
309     }
310 
311     private void addBday(HTMLDocument fragment, BNode card) {
312         final HTMLDocument.TextField bday = fragment.getSingularTextField(
313                 Microformats2Prefixes.TIME_PROPERTY_PREFIX + cardFields[16]);
314         if (bday.source() == null)
315             return;
316 
317         Node attribute = bday.source().getAttributes().getNamedItem("datetime");
318         if (attribute == null) {
319             conditionallyAddStringProperty(
320                     bday.source(),
321                     card, vCARD.bday, bday.value()
322             );
323         } else {
324             conditionallyAddStringProperty(
325                     bday.source(),
326                     card, vCARD.bday, attribute.getNodeValue()
327             );
328 
329         }
330     }
331 
332     private void addKey(HTMLDocument fragment, BNode card) throws ExtractionException {
333         final HTMLDocument.TextField uid = fragment.getSingularTextField
334                 (Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[17]);
335         if(uid.source()==null)
336             return;
337         addIRIProperty(card, vCARD.key, fragment.resolveIRI(uid.value()));
338     }
339 
340     private void addOrg(HTMLDocument fragment, BNode card) {
341         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX +
342                 cardFields[18], vCARD.org);
343     }
344 
345     private void addJobTitle(HTMLDocument fragment, BNode card) {
346         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX +
347                 cardFields[19], vCARD.job_title);
348     }
349 
350     private void addRole(HTMLDocument fragment, BNode card) {
351         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX +
352                 cardFields[20], vCARD.role);
353     }
354 
355     private void addImpp(HTMLDocument fragment, BNode card) throws ExtractionException {
356         final HTMLDocument.TextField impp = fragment.getSingularTextField
357                 (Microformats2Prefixes.URL_PROPERTY_PREFIX + cardFields[21]);
358         if(impp.source()==null)
359             return;
360         addIRIProperty(card, vCARD.impp, fragment.resolveIRI(impp.value()));
361     }
362 
363     private void addSex(HTMLDocument fragment, BNode card) {
364         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX +
365                 cardFields[22], vCARD.sex);
366     }
367 
368     private void addGenderIdentity(HTMLDocument fragment, BNode card) {
369         mapFieldWithProperty(fragment, card, Microformats2Prefixes.PROPERTY_PREFIX +
370                 cardFields[23], vCARD.gender_identity);
371     }
372 
373 
374     private void addAnniversary(HTMLDocument fragment, BNode card) {
375         final HTMLDocument.TextField anniversary = fragment.getSingularTextField(
376                 Microformats2Prefixes.TIME_PROPERTY_PREFIX + cardFields[24]);
377         if (anniversary.source() == null)
378             return;
379 
380         Node attribute = anniversary.source().getAttributes().getNamedItem("datetime");
381         if (attribute == null) {
382             conditionallyAddStringProperty(
383                     anniversary.source(),
384                     card, vCARD.bday, anniversary.value()
385             );
386         } else {
387             conditionallyAddStringProperty(
388                     anniversary.source(),
389                     card, vCARD.bday, attribute.getNodeValue()
390             );
391 
392         }
393     }
394 
395     private void addAdr(HTMLDocument doc, Resource card) throws ExtractionException {
396         List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + cardFields[25] +
397                 Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + cardFields[25]);
398         if (nodes.isEmpty())
399             return;
400         for (Node node : nodes) {
401             BNode location = valueFactory.createBNode();
402             addIRIProperty(location, RDF.TYPE, vCARD.Address);
403             HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
404             for (String field : addressFields) {
405                 HTMLDocument.TextField[] values = fragment.getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX+field);
406                 for (HTMLDocument.TextField val : values) {
407                     if(!field.equals("geo")) {
408                         conditionallyAddStringProperty(
409                                 val.source(),
410                                 location, vCARD.getProperty(field), val.value()
411                         );
412                     }else {
413                         addGeo(new HTMLDocument(node),card);
414                     }
415                 }
416             }
417         }
418     }
419 
420     private void addGeo(HTMLDocument doc, Resource card) throws ExtractionException {
421         List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + cardFields[26] +
422                 Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + cardFields[26]);
423         if (nodes.isEmpty())
424             return;
425         for (Node node : nodes) {
426             BNode location = valueFactory.createBNode();
427             addIRIProperty(location, RDF.TYPE, vCARD.Geo);
428             HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
429             for (String field : geoFields) {
430                 HTMLDocument.TextField[] values = fragment.getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX+field);
431                 for (HTMLDocument.TextField val : values) {
432                     Node attribute=val.source().getAttributes().getNamedItem("title");
433                     if (attribute==null){
434                         conditionallyAddStringProperty(
435                                 val.source(),
436                                 location, vCARD.getProperty(field), val.value()
437                         );
438                     }else{
439                         conditionallyAddStringProperty(
440                                 val.source(),
441                                 location, vCARD.getProperty(field), attribute.getNodeValue()
442                         );
443                     }
444                 }
445             }
446         }
447     }
448 
449 }