View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractorDescription;
22  import org.apache.any23.extractor.TagSoupExtractionResult;
23  import org.apache.any23.rdf.RDFUtils;
24  import org.apache.any23.vocab.ICAL;
25  import org.eclipse.rdf4j.model.BNode;
26  import org.eclipse.rdf4j.model.Resource;
27  import org.eclipse.rdf4j.model.IRI;
28  import org.eclipse.rdf4j.model.vocabulary.RDF;
29  import org.w3c.dom.Node;
30  
31  import javax.xml.datatype.DatatypeConfigurationException;
32  import java.text.ParseException;
33  import java.util.List;
34  
35  import static org.apache.any23.extractor.html.HTMLDocument.TextField;
36  
37  /**
38   * Extractor for the <a href="http://microformats.org/wiki/hcalendar">hCalendar</a> microformat.
39   *
40   * @author Gabriele Renzi
41   */
42  public class HCalendarExtractor extends MicroformatExtractor {
43  
44      private static final ICAL vICAL = ICAL.getInstance();
45  
46      private static final String[] Components = { "Vevent", "Vtodo", "Vjournal", "Vfreebusy" };
47  
48      private static final String DATE_FORMAT = "yyyyMMdd'T'HHmm'Z'";
49  
50      private String[] textSingularProps = { "summary", "class", "transp", "description", "status", "location" };
51  
52      private String[] textDateProps = { "dtstart", "dtstamp", "dtend", };
53  
54      @Override
55      public ExtractorDescription getDescription() {
56          return HCalendarExtractorFactory.getDescriptionInstance();
57      }
58  
59      @Override
60      protected boolean extract() throws ExtractionException {
61          final HTMLDocument document = getHTMLDocument();
62          List<Node> calendars = document.findAllByClassName("vcalendar");
63          if (calendars.size() == 0)
64              // vcal allows to avoid top name, in which case whole document is
65              // the calendar, let's try
66              if (document.findAllByClassName("vevent").size() > 0)
67                  calendars.add(document.getDocument());
68  
69          boolean foundAny = false;
70          for (Node node : calendars)
71              foundAny |= extractCalendar(node);
72  
73          return foundAny;
74      }
75  
76      private boolean extractCalendar(Node node) throws ExtractionException {
77          IRI cal = getDocumentIRI();
78          addIRIProperty(cal, RDF.TYPE, vICAL.Vcalendar);
79          return addComponents(node, cal);
80      }
81  
82      private boolean addComponents(Node node, Resource cal) throws ExtractionException {
83          boolean foundAny = false;
84          for (String component : Components) {
85              List<Node> events = DomUtils.findAllByClassName(node, component);
86              if (events.size() == 0)
87                  continue;
88              for (Node evtNode : events)
89                  foundAny |= extractComponent(evtNode, cal, component);
90          }
91          return foundAny;
92      }
93  
94      private boolean extractComponent(Node node, Resource cal, String component) throws ExtractionException {
95          HTMLDocumentMLDocument.html#HTMLDocument">HTMLDocument compoNode = new HTMLDocument(node);
96          BNode evt = valueFactory.createBNode();
97          addIRIProperty(evt, RDF.TYPE, vICAL.getClass(component));
98          addTextProps(compoNode, evt);
99          addUrl(compoNode, evt);
100         addRRule(compoNode, evt);
101         addOrganizer(compoNode, evt);
102         addUid(compoNode, evt);
103         addBNodeProperty(cal, vICAL.component, evt);
104 
105         final TagSoupExtractionResultorg/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult();
106         tser.addResourceRoot(compoNode.getPathToLocalRoot(), evt, this.getClass());
107 
108         return true;
109     }
110 
111     private void addUid(HTMLDocument compoNode, Resource evt) {
112         TextField url = compoNode.getSingularUrlField("uid");
113         conditionallyAddStringProperty(compoNode.getDocument(), evt, vICAL.uid, url.value());
114     }
115 
116     private void addUrl(HTMLDocument compoNode, Resource evt) throws ExtractionException {
117         TextField url = compoNode.getSingularUrlField("url");
118         if ("".equals(url.value()))
119             return;
120         addIRIProperty(evt, vICAL.url, getHTMLDocument().resolveIRI(url.value()));
121     }
122 
123     private void addRRule(HTMLDocument compoNode, Resource evt) {
124         for (Node rule : compoNode.findAllByClassName("rrule")) {
125             BNode rrule = valueFactory.createBNode();
126             addIRIProperty(rrule, RDF.TYPE, vICAL.DomainOf_rrule);
127             TextField freq = new HTMLDocument(rule).getSingularTextField("freq");
128             conditionallyAddStringProperty(freq.source(), rrule, vICAL.freq, freq.value());
129             addBNodeProperty(rule, evt, vICAL.rrule, rrule);
130         }
131     }
132 
133     private void addOrganizer(HTMLDocument compoNode, Resource evt) {
134         for (Node organizer : compoNode.findAllByClassName("organizer")) {
135             // untyped
136             BNode blank = valueFactory.createBNode();
137             TextField mail = new HTMLDocument(organizer).getSingularUrlField("organizer");
138             conditionallyAddStringProperty(compoNode.getDocument(), blank, vICAL.calAddress, mail.value());
139             addBNodeProperty(organizer, evt, vICAL.organizer, blank);
140         }
141     }
142 
143     private void addTextProps(HTMLDocument node, Resource evt) {
144         for (String date : textSingularProps) {
145             HTMLDocument.TextField val = node.getSingularTextField(date);
146             conditionallyAddStringProperty(val.source(), evt, vICAL.getProperty(date), val.value());
147         }
148 
149         for (String date : textDateProps) {
150             HTMLDocument.TextField val = node.getSingularTextField(date);
151             try {
152                 conditionallyAddStringProperty(val.source(), evt, vICAL.getProperty(date),
153                         RDFUtils.getXSDDate(val.value(), DATE_FORMAT));
154             } catch (ParseException e) {
155                 // Unparsable date format just leave it as it is.
156                 conditionallyAddStringProperty(val.source(), evt, vICAL.getProperty(date), val.value());
157             } catch (DatatypeConfigurationException e) {
158                 // Unparsable date format just leave it as it is
159                 conditionallyAddStringProperty(val.source(), evt, vICAL.getProperty(date), val.value());
160             }
161         }
162 
163         HTMLDocument.TextField[] values = node.getPluralTextField("category");
164         for (TextField val : values) {
165             conditionallyAddStringProperty(val.source(), evt, vICAL.categories, val.value());
166         }
167     }
168 
169 }