1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionException;
21 import org.apache.any23.extractor.ExtractorDescription;
22 import org.apache.any23.extractor.ExtractorFactory;
23 import org.apache.any23.extractor.SimpleExtractorFactory;
24 import org.apache.any23.extractor.TagSoupExtractionResult;
25 import org.apache.any23.rdf.PopularPrefixes;
26 import org.apache.any23.rdf.RDFUtils;
27 import org.apache.any23.vocab.ICAL;
28 import org.openrdf.model.BNode;
29 import org.openrdf.model.Resource;
30 import org.openrdf.model.URI;
31 import org.openrdf.model.vocabulary.RDF;
32 import org.w3c.dom.Node;
33
34 import javax.xml.datatype.DatatypeConfigurationException;
35 import java.text.ParseException;
36 import java.util.Arrays;
37 import java.util.List;
38
39 import static org.apache.any23.extractor.html.HTMLDocument.TextField;
40
41
42
43
44
45
46
47
48 public class HCalendarExtractor extends MicroformatExtractor {
49
50 private static final ICAL vICAL = ICAL.getInstance();
51
52 public final static ExtractorFactory<HCalendarExtractor> factory =
53 SimpleExtractorFactory.create(
54 "html-mf-hcalendar",
55 PopularPrefixes.createSubset("rdf", "ical"),
56 Arrays.asList("text/html;q=0.1", "application/xhtml+xml;q=0.1"),
57 "example-mf-hcalendar.html",
58 HCalendarExtractor.class);
59
60 private static final String[] Components = {"Vevent", "Vtodo", "Vjournal", "Vfreebusy"};
61
62 private static final String DATE_FORMAT = "yyyyMMdd'T'HHmm'Z'";
63
64 private String[] textSingularProps = {
65 "summary",
66 "class",
67 "transp",
68 "description",
69 "status",
70 "location"};
71
72 private String[] textDateProps = {
73 "dtstart",
74 "dtstamp",
75 "dtend",
76 };
77
78 public ExtractorDescription getDescription() {
79 return factory;
80 }
81
82 @Override
83 protected boolean extract() throws ExtractionException {
84 final HTMLDocument document = getHTMLDocument();
85 List<Node> calendars = document.findAllByClassName("vcalendar");
86 if (calendars.size() == 0)
87
88
89 if (document.findAllByClassName("vevent").size() > 0)
90 calendars.add(document.getDocument());
91
92 boolean foundAny = false;
93 for (Node node : calendars)
94 foundAny |= extractCalendar(node);
95
96 return foundAny;
97 }
98
99 private boolean extractCalendar(Node node) throws ExtractionException {
100 URI cal = getDocumentURI();
101 addURIProperty(cal, RDF.TYPE, vICAL.Vcalendar);
102 return addComponents(node, cal);
103 }
104
105 private boolean addComponents(Node node, Resource cal) throws ExtractionException {
106 boolean foundAny = false;
107 for (String component : Components) {
108 List<Node> events = DomUtils.findAllByClassName(node, component);
109 if (events.size() == 0)
110 continue;
111 for (Node evtNode : events)
112 foundAny |= extractComponent(evtNode, cal, component);
113 }
114 return foundAny;
115 }
116
117 private boolean extractComponent(Node node, Resource cal, String component) throws ExtractionException {
118 HTMLDocument compoNode = new HTMLDocument(node);
119 BNode evt = valueFactory.createBNode();
120 addURIProperty(evt, RDF.TYPE, vICAL.getClass(component));
121 addTextProps(compoNode, evt);
122 addUrl(compoNode, evt);
123 addRRule(compoNode, evt);
124 addOrganizer(compoNode, evt);
125 addUid(compoNode, evt);
126 addBNodeProperty(cal, vICAL.component, evt);
127
128 final TagSoupExtractionResult tser = (TagSoupExtractionResult) getCurrentExtractionResult();
129 tser.addResourceRoot( compoNode.getPathToLocalRoot(), evt, this.getClass() );
130
131 return true;
132 }
133
134 private void addUid(HTMLDocument compoNode, Resource evt) {
135 TextField url = compoNode.getSingularUrlField("uid");
136 conditionallyAddStringProperty(
137 compoNode.getDocument(),
138 evt, vICAL.uid, url.value()
139 );
140 }
141
142 private void addUrl(HTMLDocument compoNode, Resource evt) throws ExtractionException {
143 TextField url = compoNode.getSingularUrlField("url");
144 if ("".equals(url.value())) return;
145 addURIProperty(evt, vICAL.url, getHTMLDocument().resolveURI(url.value()));
146 }
147
148 private void addRRule(HTMLDocument compoNode, Resource evt) {
149 for (Node rule : compoNode.findAllByClassName("rrule")) {
150 BNode rrule = valueFactory.createBNode();
151 addURIProperty(rrule, RDF.TYPE, vICAL.DomainOf_rrule);
152 TextField freq = new HTMLDocument(rule).getSingularTextField("freq");
153 conditionallyAddStringProperty(
154 freq.source(),
155 rrule, vICAL.freq, freq.value()
156 );
157 addBNodeProperty(
158 rule,
159 evt, vICAL.rrule, rrule
160 );
161 }
162 }
163
164 private void addOrganizer(HTMLDocument compoNode, Resource evt) {
165 for (Node organizer : compoNode.findAllByClassName("organizer")) {
166
167 BNode blank = valueFactory.createBNode();
168 TextField mail = new HTMLDocument(organizer).getSingularUrlField("organizer");
169 conditionallyAddStringProperty(
170 compoNode.getDocument(),
171 blank, vICAL.calAddress, mail.value()
172 );
173 addBNodeProperty(
174 organizer,
175 evt, vICAL.organizer, blank
176 );
177 }
178 }
179
180 private void addTextProps(HTMLDocument node, Resource evt) {
181 for (String date : textSingularProps) {
182 HTMLDocument.TextField val = node.getSingularTextField(date);
183 conditionallyAddStringProperty(
184 val.source(),
185 evt, vICAL.getProperty(date), val.value()
186 );
187 }
188
189 for (String date : textDateProps) {
190 HTMLDocument.TextField val = node.getSingularTextField(date);
191 try {
192 conditionallyAddStringProperty(
193 val.source(),
194 evt,
195 vICAL.getProperty(date),
196 RDFUtils.getXSDDate(
197 val.value(),
198 DATE_FORMAT
199 )
200 );
201 } catch (ParseException e) {
202
203 conditionallyAddStringProperty( val.source(), evt, vICAL.getProperty(date), val.value());
204 } catch (DatatypeConfigurationException e) {
205
206 conditionallyAddStringProperty(val.source(), evt, vICAL.getProperty(date), val.value());
207 }
208 }
209
210 HTMLDocument.TextField[] values = node.getPluralTextField("category");
211 for (TextField val : values) {
212 conditionallyAddStringProperty(val.source(), evt, vICAL.categories, val.value());
213 }
214 }
215
216 }