View Javadoc

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractorDescription;
22  import org.apache.any23.extractor.ExtractorFactory;
23  import org.apache.any23.extractor.SimpleExtractorFactory;
24  import org.apache.any23.extractor.TagSoupExtractionResult;
25  import org.apache.any23.rdf.PopularPrefixes;
26  import org.apache.any23.rdf.RDFUtils;
27  import org.apache.any23.vocab.ICAL;
28  import org.openrdf.model.BNode;
29  import org.openrdf.model.Resource;
30  import org.openrdf.model.URI;
31  import org.openrdf.model.vocabulary.RDF;
32  import org.w3c.dom.Node;
33  
34  import javax.xml.datatype.DatatypeConfigurationException;
35  import java.text.ParseException;
36  import java.util.Arrays;
37  import java.util.List;
38  
39  import static org.apache.any23.extractor.html.HTMLDocument.TextField;
40  
41  
42  /**
43   * Extractor for the <a href="http://microformats.org/wiki/hcalendar">hCalendar</a>
44   * microformat.
45   *
46   * @author Gabriele Renzi
47   */
48  public class HCalendarExtractor extends MicroformatExtractor {
49  
50      private static final ICAL vICAL = ICAL.getInstance();
51  
52      public final static ExtractorFactory<HCalendarExtractor> factory =
53              SimpleExtractorFactory.create(
54                      "html-mf-hcalendar",
55                      PopularPrefixes.createSubset("rdf", "ical"),
56                      Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
57                      "example-mf-hcalendar.html",
58                      HCalendarExtractor.class);
59  
60      private static final String[] Components = {"Vevent", "Vtodo", "Vjournal", "Vfreebusy"};
61  
62      private static final String DATE_FORMAT = "yyyyMMdd'T'HHmm'Z'";
63  
64      private String[] textSingularProps = {
65              "summary",
66              "class",
67              "transp",
68              "description",
69              "status",
70              "location"};
71  
72      private String[] textDateProps = {
73              "dtstart",
74              "dtstamp",
75              "dtend",
76      };
77  
78      public ExtractorDescription getDescription() {
79          return factory;
80      }
81  
82      @Override
83      protected boolean extract() throws ExtractionException {
84          final HTMLDocument document = getHTMLDocument();
85          List<Node> calendars = document.findAllByClassName("vcalendar");
86          if (calendars.size() == 0)
87              // vcal allows to avoid top name, in which case whole document is
88              // the calendar, let's try
89              if (document.findAllByClassName("vevent").size() > 0)
90                  calendars.add(document.getDocument());
91  
92          boolean foundAny = false;
93          for (Node node : calendars)
94              foundAny |= extractCalendar(node);
95  
96          return foundAny;
97      }
98  
99      private boolean extractCalendar(Node node) throws ExtractionException {
100         URI cal = getDocumentURI();
101         addURIProperty(cal, RDF.TYPE, vICAL.Vcalendar);
102         return addComponents(node, cal);
103     }
104 
105     private boolean addComponents(Node node, Resource cal) throws ExtractionException {
106         boolean foundAny = false;
107         for (String component : Components) {
108             List<Node> events = DomUtils.findAllByClassName(node, component);
109             if (events.size() == 0)
110                 continue;
111             for (Node evtNode : events)
112                 foundAny |= extractComponent(evtNode, cal, component);
113         }
114         return foundAny;
115     }
116 
117     private boolean extractComponent(Node node, Resource cal, String component) throws ExtractionException {
118         HTMLDocument compoNode = new HTMLDocument(node);
119         BNode evt = valueFactory.createBNode();
120         addURIProperty(evt, RDF.TYPE, vICAL.getClass(component));
121         addTextProps(compoNode, evt);
122         addUrl(compoNode, evt);
123         addRRule(compoNode, evt);
124         addOrganizer(compoNode, evt);
125         addUid(compoNode, evt);
126         addBNodeProperty(cal, vICAL.component, evt);
127 
128         final TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult();
129         tser.addResourceRoot( compoNode.getPathToLocalRoot(), evt, this.getClass() );
130 
131         return true;
132     }
133 
134     private void addUid(HTMLDocument compoNode, Resource evt) {
135         TextField url = compoNode.getSingularUrlField("uid");
136         conditionallyAddStringProperty(
137                 compoNode.getDocument(),
138                 evt, vICAL.uid, url.value()
139         );
140     }
141 
142     private void addUrl(HTMLDocument compoNode, Resource evt) throws ExtractionException {
143         TextField url = compoNode.getSingularUrlField("url");
144         if ("".equals(url.value())) return;
145         addURIProperty(evt, vICAL.url, getHTMLDocument().resolveURI(url.value()));
146     }
147 
148     private void addRRule(HTMLDocument compoNode, Resource evt) {
149         for (Node rule : compoNode.findAllByClassName("rrule")) {
150             BNode rrule = valueFactory.createBNode();
151             addURIProperty(rrule, RDF.TYPE, vICAL.DomainOf_rrule);
152             TextField freq = new HTMLDocument(rule).getSingularTextField("freq");
153             conditionallyAddStringProperty(
154                     freq.source(),
155                     rrule, vICAL.freq, freq.value()
156             );
157             addBNodeProperty(
158                     rule,
159                     evt, vICAL.rrule, rrule
160             );
161         }
162     }
163 
164     private void addOrganizer(HTMLDocument compoNode, Resource evt) {
165         for (Node organizer : compoNode.findAllByClassName("organizer")) {
166             //untyped
167             BNode blank = valueFactory.createBNode();
168             TextField mail = new HTMLDocument(organizer).getSingularUrlField("organizer");
169             conditionallyAddStringProperty(
170                     compoNode.getDocument(),
171                     blank, vICAL.calAddress, mail.value()
172             );
173             addBNodeProperty(
174                     organizer,
175                     evt, vICAL.organizer, blank
176             );
177         }
178     }
179 
180     private void addTextProps(HTMLDocument node, Resource evt) {
181         for (String date : textSingularProps) {
182             HTMLDocument.TextField val = node.getSingularTextField(date);
183             conditionallyAddStringProperty(
184                     val.source(),
185                     evt, vICAL.getProperty(date), val.value()
186             );
187         }
188 
189         for (String date : textDateProps) {
190             HTMLDocument.TextField val = node.getSingularTextField(date);
191             try {
192                 conditionallyAddStringProperty(
193                         val.source(),
194                         evt,
195                         vICAL.getProperty(date),
196                         RDFUtils.getXSDDate(
197                                 val.value(),
198                                 DATE_FORMAT
199                         )
200                 );
201             } catch (ParseException e) {
202                 // Unparsable date format just leave it as it is.
203                 conditionallyAddStringProperty( val.source(), evt, vICAL.getProperty(date), val.value());
204             } catch (DatatypeConfigurationException e) {
205                 // Unparsable date format just leave it as it is
206                 conditionallyAddStringProperty(val.source(), evt, vICAL.getProperty(date), val.value());
207             }
208         }
209 
210         HTMLDocument.TextField[] values = node.getPluralTextField("category");
211         for (TextField val : values) {
212             conditionallyAddStringProperty(val.source(), evt, vICAL.categories, val.value());
213         }
214     }
215 
216 }