View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html.microformats2;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractionResult;
22  import org.apache.any23.extractor.ExtractorDescription;
23  import org.apache.any23.extractor.html.EntityBasedMicroformatExtractor;
24  import org.apache.any23.vocab.HEvent;
25  import org.apache.any23.vocab.VCard;
26  import org.eclipse.rdf4j.model.BNode;
27  import org.eclipse.rdf4j.model.Resource;
28  import org.eclipse.rdf4j.model.IRI;
29  import org.eclipse.rdf4j.model.vocabulary.RDF;
30  import org.w3c.dom.Node;
31  import org.apache.any23.extractor.html.HTMLDocument;
32  
33  import java.util.List;
34  
35  import static org.apache.any23.extractor.html.HTMLDocument.TextField;
36  
37  /**
38   * Extractor for the <a href="http://microformats.org/wiki/h-event">h-event</a> microformat.
39   *
40   * @author Nisala Nirmana
41   */
42  public class HEventExtractor extends EntityBasedMicroformatExtractor {
43  
44      private static final HEvent vEvent = HEvent.getInstance();
45      private static final VCard vVCARD = VCard.getInstance();
46  
47      private String[] eventFields = { "name", "summary", "start", "end", "duration", "description", "url", "category",
48              "location", "attendee" };
49  
50      private static final String[] geoFields = { "latitude", "longitude", "altitude" };
51  
52      @Override
53      public ExtractorDescription getDescription() {
54          return HEventExtractorFactory.getDescriptionInstance();
55      }
56  
57      @Override
58      protected String getBaseClassName() {
59          return Microformats2Prefixes.CLASS_PREFIX + "event";
60      }
61  
62      @Override
63      protected void resetExtractor() {
64          // Empty.
65      }
66  
67      @Override
68      protected boolean extractEntity(Node node, ExtractionResult out) throws ExtractionException {
69          final BNode event = getBlankNodeFor(node);
70          conditionallyAddResourceProperty(event, RDF.TYPE, vEvent.event);
71          final HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
72          addName(fragment, event);
73          addSummary(fragment, event);
74          addStart(fragment, event);
75          addEnd(fragment, event);
76          addDuration(fragment, event);
77          addDescription(fragment, event);
78          addURLs(fragment, event);
79          addCategories(fragment, event);
80          addLocations(fragment, event);
81          addAttendees(fragment, event);
82          return true;
83      }
84  
85      public Resource extractEntityAsEmbeddedProperty(HTMLDocument fragment, BNode event, ExtractionResult out)
86              throws ExtractionException {
87          this.setCurrentExtractionResult(out);
88          addName(fragment, event);
89          addSummary(fragment, event);
90          addStart(fragment, event);
91          addEnd(fragment, event);
92          addDuration(fragment, event);
93          addDescription(fragment, event);
94          addURLs(fragment, event);
95          addCategories(fragment, event);
96          addLocations(fragment, event);
97          addAttendees(fragment, event);
98          return event;
99      }
100 
101     private void addAttendees(HTMLDocument doc, Resource entry) throws ExtractionException {
102         List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + eventFields[9]
103                 + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "card");
104         if (nodes.isEmpty())
105             return;
106         HCardExtractorFactoryormats2/HCardExtractorFactory.html#HCardExtractorFactory">HCardExtractorFactory factory = new HCardExtractorFactory();
107         HCardExtractor extractor = factory.createExtractor();
108         for (Node node : nodes) {
109             BNode attendee = valueFactory.createBNode();
110             addIRIProperty(attendee, RDF.TYPE, vEvent.attendee);
111             extractor.extractEntityAsEmbeddedProperty(new HTMLDocument(node), attendee, getCurrentExtractionResult());
112         }
113     }
114 
115     private void mapFieldWithProperty(HTMLDocument fragment, BNode recipe, String fieldClass, IRI property) {
116         HTMLDocument.TextField title = fragment.getSingularTextField(fieldClass);
117         conditionallyAddStringProperty(title.source(), recipe, property, title.value());
118     }
119 
120     private void addName(HTMLDocument fragment, BNode event) {
121         mapFieldWithProperty(fragment, event, Microformats2Prefixes.PROPERTY_PREFIX + eventFields[0], vEvent.name);
122     }
123 
124     private void addSummary(HTMLDocument fragment, BNode event) {
125         mapFieldWithProperty(fragment, event, Microformats2Prefixes.PROPERTY_PREFIX + eventFields[1], vEvent.summary);
126     }
127 
128     private void addStart(HTMLDocument fragment, BNode event) {
129         final TextField start = fragment
130                 .getSingularTextField(Microformats2Prefixes.TIME_PROPERTY_PREFIX + eventFields[2]);
131         if (start.source() == null)
132             return;
133         Node attribute = start.source().getAttributes().getNamedItem("datetime");
134         if (attribute == null) {
135             conditionallyAddStringProperty(start.source(), event, vEvent.start, start.value());
136         } else {
137             conditionallyAddStringProperty(start.source(), event, vEvent.start, attribute.getNodeValue());
138         }
139     }
140 
141     private void addEnd(HTMLDocument fragment, BNode event) {
142         final TextField end = fragment
143                 .getSingularTextField(Microformats2Prefixes.TIME_PROPERTY_PREFIX + eventFields[3]);
144         if (end.source() == null)
145             return;
146         Node attribute = end.source().getAttributes().getNamedItem("datetime");
147         if (attribute == null) {
148             conditionallyAddStringProperty(end.source(), event, vEvent.end, end.value());
149         } else {
150             conditionallyAddStringProperty(end.source(), event, vEvent.end, attribute.getNodeValue());
151         }
152     }
153 
154     private void addDuration(HTMLDocument fragment, BNode event) {
155         final TextField duration = fragment
156                 .getSingularTextField(Microformats2Prefixes.TIME_PROPERTY_PREFIX + eventFields[4]);
157         if (duration.source() == null)
158             return;
159         Node attribute = duration.source().getAttributes().getNamedItem("datetime");
160         if (attribute == null) {
161             conditionallyAddStringProperty(duration.source(), event, vEvent.duration, duration.value());
162         } else {
163             conditionallyAddStringProperty(duration.source(), event, vEvent.duration, attribute.getNodeValue());
164         }
165     }
166 
167     private void addDescription(HTMLDocument fragment, BNode event) {
168         mapFieldWithProperty(fragment, event, Microformats2Prefixes.PROPERTY_PREFIX + eventFields[5],
169                 vEvent.description);
170     }
171 
172     private void addURLs(HTMLDocument fragment, BNode event) throws ExtractionException {
173         final HTMLDocument.TextField[] urls = fragment
174                 .getPluralUrlField(Microformats2Prefixes.URL_PROPERTY_PREFIX + eventFields[6]);
175         for (HTMLDocument.TextField url : urls) {
176             addIRIProperty(event, vEvent.url, fragment.resolveIRI(url.value()));
177         }
178     }
179 
180     private void addCategories(HTMLDocument fragment, BNode event) {
181         final HTMLDocument.TextField[] categories = fragment
182                 .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + eventFields[7]);
183         for (HTMLDocument.TextField category : categories) {
184             conditionallyAddStringProperty(category.source(), event, vEvent.category, category.value());
185         }
186     }
187 
188     private void addLocations(HTMLDocument doc, Resource entry) throws ExtractionException {
189         List<Node> nodes = doc.findAllByClassName(Microformats2Prefixes.PROPERTY_PREFIX + eventFields[8]
190                 + Microformats2Prefixes.SPACE_SEPARATOR + Microformats2Prefixes.CLASS_PREFIX + "geo");
191         if (nodes.isEmpty())
192             return;
193         for (Node node : nodes) {
194             BNode location = valueFactory.createBNode();
195             addIRIProperty(location, RDF.TYPE, vEvent.location);
196             HTMLDocumentl/HTMLDocument.html#HTMLDocument">HTMLDocument fragment = new HTMLDocument(node);
197             for (String field : geoFields) {
198                 HTMLDocument.TextField[] values = fragment
199                         .getPluralTextField(Microformats2Prefixes.PROPERTY_PREFIX + field);
200                 for (HTMLDocument.TextField val : values) {
201                     Node attribute = val.source().getAttributes().getNamedItem("title");
202                     if (attribute == null) {
203                         conditionallyAddStringProperty(val.source(), location, vVCARD.getProperty(field), val.value());
204                     } else {
205                         conditionallyAddStringProperty(val.source(), location, vVCARD.getProperty(field),
206                                 attribute.getNodeValue());
207                     }
208                 }
209             }
210         }
211     }
212 
213 }