View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractorDescription;
22  import org.apache.any23.extractor.TagSoupExtractionResult;
23  import org.apache.any23.rdf.RDFUtils;
24  import org.apache.any23.vocab.ICAL;
25  import org.eclipse.rdf4j.model.BNode;
26  import org.eclipse.rdf4j.model.Resource;
27  import org.eclipse.rdf4j.model.IRI;
28  import org.eclipse.rdf4j.model.vocabulary.RDF;
29  import org.w3c.dom.Node;
30  
31  import javax.xml.datatype.DatatypeConfigurationException;
32  import java.text.ParseException;
33  import java.util.List;
34  
35  import static org.apache.any23.extractor.html.HTMLDocument.TextField;
36  
37  
38  /**
39   * Extractor for the <a href="http://microformats.org/wiki/hcalendar">hCalendar</a>
40   * microformat.
41   *
42   * @author Gabriele Renzi
43   */
44  public class HCalendarExtractor extends MicroformatExtractor {
45  
46      private static final ICAL vICAL = ICAL.getInstance();
47  
48      private static final String[] Components = {"Vevent", "Vtodo", "Vjournal", "Vfreebusy"};
49  
50      private static final String DATE_FORMAT = "yyyyMMdd'T'HHmm'Z'";
51  
52      private String[] textSingularProps = {
53              "summary",
54              "class",
55              "transp",
56              "description",
57              "status",
58              "location"};
59  
60      private String[] textDateProps = {
61              "dtstart",
62              "dtstamp",
63              "dtend",
64      };
65  
66      @Override
67      public ExtractorDescription getDescription() {
68          return HCalendarExtractorFactory.getDescriptionInstance();
69      }
70  
71      @Override
72      protected boolean extract() throws ExtractionException {
73          final HTMLDocument document = getHTMLDocument();
74          List<Node> calendars = document.findAllByClassName("vcalendar");
75          if (calendars.size() == 0)
76              // vcal allows to avoid top name, in which case whole document is
77              // the calendar, let's try
78              if (document.findAllByClassName("vevent").size() > 0)
79                  calendars.add(document.getDocument());
80  
81          boolean foundAny = false;
82          for (Node node : calendars)
83              foundAny |= extractCalendar(node);
84  
85          return foundAny;
86      }
87  
88      private boolean extractCalendar(Node node) throws ExtractionException {
89          IRI cal = getDocumentIRI();
90          addIRIProperty(cal, RDF.TYPE, vICAL.Vcalendar);
91          return addComponents(node, cal);
92      }
93  
94      private boolean addComponents(Node node, Resource cal) throws ExtractionException {
95          boolean foundAny = false;
96          for (String component : Components) {
97              List<Node> events = DomUtils.findAllByClassName(node, component);
98              if (events.size() == 0)
99                  continue;
100             for (Node evtNode : events)
101                 foundAny |= extractComponent(evtNode, cal, component);
102         }
103         return foundAny;
104     }
105 
106     private boolean extractComponent(Node node, Resource cal, String component) throws ExtractionException {
107         HTMLDocumentMLDocument.html#HTMLDocument">HTMLDocument compoNode = new HTMLDocument(node);
108         BNode evt = valueFactory.createBNode();
109         addIRIProperty(evt, RDF.TYPE, vICAL.getClass(component));
110         addTextProps(compoNode, evt);
111         addUrl(compoNode, evt);
112         addRRule(compoNode, evt);
113         addOrganizer(compoNode, evt);
114         addUid(compoNode, evt);
115         addBNodeProperty(cal, vICAL.component, evt);
116 
117         final TagSoupExtractionResultorg/apache/any23/extractor/TagSoupExtractionResult.html#TagSoupExtractionResult">TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult();
118         tser.addResourceRoot( compoNode.getPathToLocalRoot(), evt, this.getClass() );
119 
120         return true;
121     }
122 
123     private void addUid(HTMLDocument compoNode, Resource evt) {
124         TextField url = compoNode.getSingularUrlField("uid");
125         conditionallyAddStringProperty(
126                 compoNode.getDocument(),
127                 evt, vICAL.uid, url.value()
128         );
129     }
130 
131     private void addUrl(HTMLDocument compoNode, Resource evt) throws ExtractionException {
132         TextField url = compoNode.getSingularUrlField("url");
133         if ("".equals(url.value())) return;
134         addIRIProperty(evt, vICAL.url, getHTMLDocument().resolveIRI(url.value()));
135     }
136 
137     private void addRRule(HTMLDocument compoNode, Resource evt) {
138         for (Node rule : compoNode.findAllByClassName("rrule")) {
139             BNode rrule = valueFactory.createBNode();
140             addIRIProperty(rrule, RDF.TYPE, vICAL.DomainOf_rrule);
141             TextField freq = new HTMLDocument(rule).getSingularTextField("freq");
142             conditionallyAddStringProperty(
143                     freq.source(),
144                     rrule, vICAL.freq, freq.value()
145             );
146             addBNodeProperty(
147                     rule,
148                     evt, vICAL.rrule, rrule
149             );
150         }
151     }
152 
153     private void addOrganizer(HTMLDocument compoNode, Resource evt) {
154         for (Node organizer : compoNode.findAllByClassName("organizer")) {
155             //untyped
156             BNode blank = valueFactory.createBNode();
157             TextField mail = new HTMLDocument(organizer).getSingularUrlField("organizer");
158             conditionallyAddStringProperty(
159                     compoNode.getDocument(),
160                     blank, vICAL.calAddress, mail.value()
161             );
162             addBNodeProperty(
163                     organizer,
164                     evt, vICAL.organizer, blank
165             );
166         }
167     }
168 
169     private void addTextProps(HTMLDocument node, Resource evt) {
170         for (String date : textSingularProps) {
171             HTMLDocument.TextField val = node.getSingularTextField(date);
172             conditionallyAddStringProperty(
173                     val.source(),
174                     evt, vICAL.getProperty(date), val.value()
175             );
176         }
177 
178         for (String date : textDateProps) {
179             HTMLDocument.TextField val = node.getSingularTextField(date);
180             try {
181                 conditionallyAddStringProperty(
182                         val.source(),
183                         evt,
184                         vICAL.getProperty(date),
185                         RDFUtils.getXSDDate(
186                                 val.value(),
187                                 DATE_FORMAT
188                         )
189                 );
190             } catch (ParseException e) {
191                 // Unparsable date format just leave it as it is.
192                 conditionallyAddStringProperty( val.source(), evt, vICAL.getProperty(date), val.value());
193             } catch (DatatypeConfigurationException e) {
194                 // Unparsable date format just leave it as it is
195                 conditionallyAddStringProperty(val.source(), evt, vICAL.getProperty(date), val.value());
196             }
197         }
198 
199         HTMLDocument.TextField[] values = node.getPluralTextField("category");
200         for (TextField val : values) {
201             conditionallyAddStringProperty(val.source(), evt, vICAL.categories, val.value());
202         }
203     }
204 
205 }