View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.calendar;
19  
20  import biweekly.ICalDataType;
21  import biweekly.ICalVersion;
22  import biweekly.ICalendar;
23  import biweekly.component.ICalComponent;
24  import biweekly.component.VTimezone;
25  import biweekly.io.ParseWarning;
26  import biweekly.io.SkipMeException;
27  import biweekly.io.StreamReader;
28  import biweekly.io.TimezoneAssignment;
29  import biweekly.io.TimezoneInfo;
30  import biweekly.io.WriteContext;
31  import biweekly.io.json.JCalValue;
32  import biweekly.io.json.JsonValue;
33  import biweekly.io.scribe.ScribeIndex;
34  import biweekly.io.scribe.property.ICalPropertyScribe;
35  import biweekly.parameter.Encoding;
36  import biweekly.parameter.ICalParameters;
37  import biweekly.property.Geo;
38  import biweekly.property.ICalProperty;
39  import biweekly.util.DateTimeComponents;
40  import biweekly.util.ICalDateFormat;
41  import org.apache.any23.extractor.ExtractionContext;
42  import org.apache.any23.extractor.ExtractionException;
43  import org.apache.any23.extractor.ExtractionParameters;
44  import org.apache.any23.extractor.ExtractionResult;
45  import org.apache.any23.extractor.Extractor;
46  import org.apache.any23.extractor.IssueReport;
47  import org.apache.any23.vocab.ICAL;
48  import org.apache.commons.lang3.StringUtils;
49  import org.eclipse.rdf4j.model.BNode;
50  import org.eclipse.rdf4j.model.IRI;
51  import org.eclipse.rdf4j.model.Value;
52  import org.eclipse.rdf4j.model.ValueFactory;
53  import org.eclipse.rdf4j.model.datatypes.XMLDatatypeUtil;
54  import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
55  import org.eclipse.rdf4j.model.vocabulary.RDF;
56  import org.eclipse.rdf4j.model.vocabulary.XMLSchema;
57  
58  import java.io.IOException;
59  import java.io.InputStream;
60  import java.io.PrintWriter;
61  import java.io.StringWriter;
62  import java.math.BigDecimal;
63  import java.math.BigInteger;
64  import java.util.Collection;
65  import java.util.List;
66  import java.util.Locale;
67  import java.util.Map;
68  import java.util.Objects;
69  import java.util.Set;
70  import java.util.TimeZone;
71  import java.util.regex.Matcher;
72  import java.util.regex.Pattern;
73  import java.util.stream.Collectors;
74  import java.util.stream.Stream;
75  
76  /**
77   * @author Hans Brende (hansbrende@apache.org)
78   */
79  abstract class BaseCalendarExtractor implements Extractor.ContentExtractor {
80  
81      @Override
82      public void setStopAtFirstError(boolean b) {
83          // unsupported
84      }
85  
86      private static final ValueFactory f = SimpleValueFactory.getInstance();
87      private static final ICAL vICAL = ICAL.getInstance();
88  
89      abstract StreamReader reader(InputStream inputStream);
90  
91      @Override
92      public final void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext,
93              InputStream inputStream, ExtractionResult result) throws IOException, ExtractionException {
94          result.writeNamespace(RDF.PREFIX, RDF.NAMESPACE);
95          result.writeNamespace(ICAL.PREFIX, ICAL.NS);
96          result.writeNamespace(XMLSchema.PREFIX, XMLSchema.NAMESPACE);
97  
98          ScribeIndex index = new ScribeIndex();
99          try (StreamReader reader = reader(inputStream)) {
100             ICalendar cal;
101             while ((cal = reader.readNext()) != null) {
102                 for (ParseWarning warning : reader.getWarnings()) {
103                     String message = warning.getMessage();
104                     Integer lineNumber = warning.getLineNumber();
105                     if (lineNumber == null) {
106                         result.notifyIssue(IssueReport.IssueLevel.WARNING, message, -1, -1);
107                     } else {
108                         result.notifyIssue(IssueReport.IssueLevel.WARNING, message, lineNumber, -1);
109                     }
110                 }
111 
112                 BNode calNode = f.createBNode();
113                 result.writeTriple(calNode, RDF.TYPE, vICAL.Vcalendar);
114                 WriteContext ctx = new WriteContext(ICalVersion.V2_0, cal.getTimezoneInfo(), null);
115                 extract(index, ctx, calNode, cal, result, true);
116             }
117         } catch (Exception e) {
118             result.notifyIssue(IssueReport.IssueLevel.FATAL, toString(e), -1, -1);
119         }
120     }
121 
122     private static String toString(Throwable th) {
123         StringWriter writer = new StringWriter();
124         try (PrintWriter pw = new PrintWriter(writer)) {
125             th.printStackTrace(pw);
126         }
127         String string = writer.toString();
128         if (string.length() > 200) {
129             return string.substring(0, 197) + "...";
130         }
131         return string;
132     }
133 
134     private static String localNameOfType(String typeName) {
135         return camelCase(typeName, false);
136     }
137 
138     private static String localNameOfProperty(String propertyName) {
139         return camelCase(propertyName, true);
140     }
141 
142     private static String camelCase(String name, boolean forProperty) {
143         String[] nameComponents = name.toLowerCase(Locale.ENGLISH).split("-");
144         StringBuilder sb = new StringBuilder(name.length());
145         int i = 0;
146         if (forProperty) {
147             sb.append(nameComponents[i++]);
148         }
149         for (int len = nameComponents.length; i < len; i++) {
150             String n = nameComponents[i];
151             if (!n.isEmpty()) {
152                 int ind = Character.charCount(n.codePointAt(0));
153                 sb.append(n.substring(0, ind).toUpperCase(Locale.ENGLISH)).append(n.substring(ind));
154             }
155         }
156         return sb.toString();
157     }
158 
159     private static IRI type(String originalName) {
160         if (originalName.regionMatches(true, 0, "X-", 0, 2)) {
161             // non-standard class
162             return f.createIRI(ICAL.NS, "X-" + localNameOfType(originalName.substring(2)));
163         }
164 
165         String name = localNameOfType(originalName);
166 
167         try {
168             return Objects.requireNonNull(vICAL.getClass(name));
169         } catch (RuntimeException e) {
170             return null;
171         }
172     }
173 
174     private static IRI predicate(String originalName, ExtractionResult result) {
175         if (originalName.regionMatches(true, 0, "X-", 0, 2)) {
176             // non-standard property
177             return f.createIRI(ICAL.NS, "x-" + localNameOfProperty(originalName.substring(2)));
178         }
179 
180         String name = localNameOfProperty(originalName);
181 
182         try {
183             return Objects.requireNonNull(vICAL.getProperty(name));
184         } catch (RuntimeException e) {
185             IRI iri = f.createIRI(ICAL.NS, name);
186             result.notifyIssue(IssueReport.IssueLevel.ERROR,
187                     "property " + iri + " (" + originalName + ") not defined in " + ICAL.class.getName(), -1, -1);
188             return iri;
189         }
190     }
191 
192     private static final String NaN = Double.toString(Double.NaN);
193 
194     private static String str(Double d) {
195         return d == null ? NaN : d.toString();
196     }
197 
198     private static BNode writeParams(BNode subject, IRI predicate, ICalParameters params, ExtractionResult result) {
199         BNode bNode = f.createBNode();
200         result.writeTriple(subject, predicate, bNode);
201         writeParams(bNode, params, result);
202         return bNode;
203     }
204 
205     private static void writeParams(BNode subject, ICalParameters params, ExtractionResult result) {
206         for (Map.Entry<String, List<String>> entry : params.getMap().entrySet()) {
207             List<String> strings = entry.getValue();
208             if (strings != null && !strings.isEmpty()) {
209                 IRI predicate = predicate(entry.getKey(), result);
210                 for (String v : strings) {
211                     result.writeTriple(subject, predicate, f.createLiteral(v));
212                 }
213             }
214         }
215     }
216 
217     private static IRI dataType(ICalDataType dataType, Boolean isFloating) {
218         if (dataType == null || ICalDataType.TEXT.equals(dataType)) {
219             return XMLSchema.STRING;
220         } else if (ICalDataType.BOOLEAN.equals(dataType)) {
221             return XMLSchema.BOOLEAN;
222         } else if (ICalDataType.INTEGER.equals(dataType)) {
223             return XMLSchema.INTEGER;
224         } else if (ICalDataType.FLOAT.equals(dataType)) {
225             return XMLSchema.FLOAT;
226         } else if (ICalDataType.BINARY.equals(dataType)) {
227             return XMLSchema.BASE64BINARY;
228         } else if (ICalDataType.URI.equals(dataType) || ICalDataType.URL.equals(dataType)
229                 || ICalDataType.CONTENT_ID.equals(dataType) || ICalDataType.CAL_ADDRESS.equals(dataType)) {
230             return XMLSchema.ANYURI;
231         } else if (ICalDataType.DATE_TIME.equals(dataType)) {
232             if (isFloating == null) {
233                 return null;
234             }
235             return isFloating ? vICAL.DATE_TIME : XMLSchema.DATETIME;
236         } else if (ICalDataType.DATE.equals(dataType)) {
237             return XMLSchema.DATE;
238         } else if (ICalDataType.TIME.equals(dataType)) {
239             return XMLSchema.TIME;
240         } else if (ICalDataType.DURATION.equals(dataType)) {
241             return XMLSchema.DURATION;
242         } else if (ICalDataType.PERIOD.equals(dataType)) {
243             return vICAL.Value_PERIOD;
244         } else if (ICalDataType.RECUR.equals(dataType)) {
245             return vICAL.Value_RECUR;
246         } else {
247             return XMLSchema.STRING;
248         }
249     }
250 
251     private static final Pattern durationWeeksPattern = Pattern.compile("(-?P)(\\d+)W");
252 
253     private static String normalizeAndReportIfInvalid(String s, IRI dataType, TimeZone zone, ExtractionResult result) {
254         if (dataType == null) {
255             return s;
256         }
257         try {
258             if (XMLSchema.DURATION.equals(dataType)) {
259                 Matcher m = durationWeeksPattern.matcher(s);
260                 if (m.matches()) {
261                     long days = Long.parseLong(m.group(2)) * 7;
262                     return m.group(1) + days + "D";
263                 }
264             } else if (vICAL.Value_PERIOD.equals(dataType)) {
265                 if (s.indexOf('/') == -1) {
266                     throw new IllegalArgumentException();
267                 }
268             } else if (zone != null && XMLSchema.DATETIME.equals(dataType)) {
269                 try {
270                     DateTimeComponents dt = DateTimeComponents.parse(s);
271                     if (!dt.isUtc()) {
272                         s = ICalDateFormat.DATE_TIME_EXTENDED.format(dt.toDate(zone), zone);
273                     }
274                 } catch (IllegalArgumentException e) {
275                     // ignore
276                 }
277             } else {
278                 s = XMLDatatypeUtil.normalize(s, dataType);
279             }
280 
281             if (!XMLDatatypeUtil.isValidValue(s, dataType)) {
282                 throw new IllegalArgumentException();
283             }
284         } catch (IllegalArgumentException e) {
285             String m = e.getMessage();
286             if (StringUtils.isBlank(m)) {
287                 m = "Not a valid " + dataType + " value: " + s;
288             }
289             result.notifyIssue(IssueReport.IssueLevel.ERROR, m, -1, -1);
290         }
291         return s;
292     }
293 
294     private static boolean writeValue(BNode subject, IRI predicate, JsonValue jsonValue, String lang, IRI dataType,
295             TimeZone zone, ExtractionResult result) {
296         if (jsonValue == null || jsonValue.isNull()) {
297             return false;
298         }
299         Object val = jsonValue.getValue();
300         if (val != null) {
301             Value v;
302             if (val instanceof Byte) {
303                 v = f.createLiteral((byte) val);
304             } else if (val instanceof Short) {
305                 v = f.createLiteral((short) val);
306             } else if (val instanceof Integer) {
307                 v = f.createLiteral((int) val);
308             } else if (val instanceof Long) {
309                 v = f.createLiteral((long) val);
310             } else if (val instanceof Float) {
311                 v = f.createLiteral((float) val);
312             } else if (val instanceof Double) {
313                 v = f.createLiteral((double) val);
314             } else if (val instanceof Boolean) {
315                 v = f.createLiteral((boolean) val);
316             } else if (val instanceof BigInteger) {
317                 v = f.createLiteral((BigInteger) val);
318             } else if (val instanceof BigDecimal) {
319                 v = f.createLiteral((BigDecimal) val);
320             } else {
321                 String str = normalizeAndReportIfInvalid(val.toString(), dataType, zone, result);
322 
323                 if (XMLSchema.STRING.equals(dataType)) {
324                     if (lang == null) {
325                         v = f.createLiteral(str);
326                     } else {
327                         v = f.createLiteral(str, lang);
328                     }
329                 } else if (XMLSchema.ANYURI.equals(dataType)) {
330                     try {
331                         v = f.createIRI(str);
332                     } catch (IllegalArgumentException e) {
333                         v = f.createLiteral(str, dataType);
334                     }
335                 } else if (vICAL.Value_PERIOD.equals(dataType)) {
336                     String[] strs = str.split("/");
337                     if (strs.length == 2) {
338                         String firstPart = normalizeAndReportIfInvalid(strs[0], XMLSchema.DATETIME, zone, result);
339                         String secondPart = strs[1];
340                         if (secondPart.indexOf('P') != -1) { // duration
341                             secondPart = normalizeAndReportIfInvalid(secondPart, XMLSchema.DURATION, zone, result);
342                         } else {
343                             secondPart = normalizeAndReportIfInvalid(secondPart, XMLSchema.DATETIME, zone, result);
344                         }
345                         str = firstPart + "/" + secondPart;
346                     }
347                     v = f.createLiteral(str);
348                 } else if (dataType != null) {
349                     v = f.createLiteral(str, dataType);
350                 } else {
351                     v = f.createLiteral(str);
352                 }
353 
354             }
355             result.writeTriple(subject, predicate, v);
356             return true;
357         }
358 
359         List<JsonValue> array = jsonValue.getArray();
360         if (array != null && !array.isEmpty()) {
361             if (array.size() == 1) {
362                 return writeValue(subject, predicate, array.get(0), lang, dataType, zone, result);
363             } else {
364                 BNode bNode = f.createBNode();
365                 result.writeTriple(subject, predicate, bNode);
366                 for (JsonValue value : array) {
367                     writeValue(bNode, RDF.VALUE, value, lang, dataType, zone, result);
368                 }
369                 return true;
370             }
371         }
372 
373         Map<String, JsonValue> object = jsonValue.getObject();
374         if (object != null) {
375             BNode bNode = f.createBNode();
376             result.writeTriple(subject, predicate, bNode);
377             for (Map.Entry<String, JsonValue> entry : object.entrySet()) {
378                 writeValue(bNode, predicate(entry.getKey(), result), entry.getValue(), lang, XMLSchema.STRING, zone,
379                         result);
380             }
381             return true;
382         }
383 
384         return false;
385     }
386 
387     private static TimeZone parseTimeZoneId(String tzId) {
388         for (;;) {
389             TimeZone zone = ICalDateFormat.parseTimeZoneId(tzId);
390             if (zone != null) {
391                 return zone;
392             }
393             int ind = tzId.indexOf('/');
394             if (ind == -1) {
395                 return null;
396             }
397             tzId = tzId.substring(ind + 1);
398         }
399     }
400 
401     @SuppressWarnings("unchecked")
402     private static <T extends ICalProperty> void writeProperty(BNode subject, ICalPropertyScribe<T> scribe,
403             ICalProperty property, WriteContext ctx, ExtractionResult result) {
404         try {
405             T prop = (T) property;
406 
407             ICalVersion version = ctx.getVersion();
408 
409             ICalDataType dataType = scribe.dataType(prop, version);
410 
411             ICalParameters params = scribe.prepareParameters(prop, ctx);
412 
413             String lang = params.getLanguage();
414             params.removeAll(ICalParameters.LANGUAGE);
415 
416             Encoding encoding = params.getEncoding();
417 
418             if (dataType == null) {
419                 dataType = params.getValue();
420                 if (dataType == null && Encoding.BASE64.equals(encoding)) {
421                     dataType = ICalDataType.BINARY;
422                 }
423             }
424             params.removeAll(ICalParameters.VALUE);
425 
426             if (ICalDataType.BINARY.equals(dataType)) {
427                 // RFC 5545 s. 3.2.7.
428                 // If the value type parameter is ";VALUE=BINARY", then the inline
429                 // encoding parameter MUST be specified with the value
430                 // ";ENCODING=BASE64"
431                 if (encoding != null && !Encoding.BASE64.equals(encoding)) {
432                     result.notifyIssue(IssueReport.IssueLevel.ERROR,
433                             "Invalid encoding " + encoding + " specified for BINARY value", -1, -1);
434                     dataType = null;
435                 }
436                 params.removeAll(ICalParameters.ENCODING);
437             }
438 
439             if (Encoding._8BIT.equals(encoding)) {
440                 // RFC 5545 s. 3.2.7.
441                 // The default encoding is "8BIT",
442                 // corresponding to a property value consisting of text.
443                 params.removeAll(ICalParameters.ENCODING);
444             }
445 
446             // RFC 5545 s. 3.1.4.
447             // There is not a property parameter to declare the charset used in a
448             // property value. The default charset for an iCalendar stream is UTF-8
449             // as defined in [RFC3629].
450             params.removeAll(ICalParameters.CHARSET);
451 
452             IRI predicate = predicate(scribe.getPropertyName(version), result);
453 
454             if (ICalDataType.CAL_ADDRESS.equals(dataType)) {
455                 subject = writeParams(subject, predicate, params, result);
456                 predicate = vICAL.calAddress;
457             } else if (!params.isEmpty()) {
458                 subject = writeParams(subject, predicate, params, result);
459                 predicate = RDF.VALUE;
460             }
461 
462             if (prop instanceof Geo) {
463                 // RFC 5870
464                 Geo g = (Geo) prop;
465                 IRI value = f.createIRI("geo:" + str(g.getLatitude()) + "," + str(g.getLongitude()));
466                 result.writeTriple(subject, predicate, value);
467             } else {
468 
469                 String tzId = params.getTimezoneId();
470                 TimezoneInfo tzInfo = ctx.getTimezoneInfo();
471                 TimeZone timeZone = null;
472                 Boolean floating;
473                 if (tzId != null) {
474                     TimezoneAssignment assign = tzInfo.getTimezone(prop);
475                     if (assign != null) {
476                         timeZone = assign.getTimeZone();
477                     } else {
478                         timeZone = parseTimeZoneId(tzId);
479                         tzInfo.setFloating(prop, true);
480                     }
481                     floating = timeZone == null ? null : Boolean.FALSE;
482                 } else {
483                     floating = tzInfo.isFloating(prop);
484                 }
485 
486                 IRI dataTypeIRI = dataType(dataType, floating);
487 
488                 JCalValue jsonVal = scribe.writeJson(prop, ctx);
489                 List<JsonValue> jsonVals = jsonVal.getValues();
490 
491                 boolean mod = false;
492                 for (JsonValue value : jsonVals) {
493                     mod |= writeValue(subject, predicate, value, lang, dataTypeIRI, timeZone, result);
494                 }
495                 if (!mod) {
496                     result.writeTriple(subject, predicate, f.createLiteral(jsonVal.asSingle()));
497                 }
498             }
499         } catch (SkipMeException e) {
500             // ignore
501         }
502     }
503 
504     private static void extract(ScribeIndex index, WriteContext ctx, BNode node, ICalComponent component,
505             ExtractionResult result, boolean writeTimezones) {
506         for (ICalProperty property : component.getProperties().values()) {
507             ctx.setParent(component);
508             writeProperty(node, index.getPropertyScribe(property), property, ctx, result);
509         }
510 
511         Stream<ICalComponent> components = component.getComponents().values().stream();
512 
513         if (writeTimezones) {
514             Collection<VTimezone> tzs = ctx.getTimezoneInfo().getComponents();
515             Set<String> tzIds = tzs.stream().map(tz -> tz.getTimezoneId().getValue()).collect(Collectors.toSet());
516             components = Stream.concat(tzs.stream(), components.filter(
517                     c -> !(c instanceof VTimezone && tzIds.contains(((VTimezone) c).getTimezoneId().getValue()))));
518         }
519 
520         components.forEachOrdered(child -> {
521             BNode childNode = f.createBNode();
522             String componentName = index.getComponentScribe(child).getComponentName();
523             IRI childType = type(componentName);
524 
525             if (childType == null) {
526                 result.writeTriple(node, predicate(componentName, result), childNode);
527             } else {
528                 result.writeTriple(node, vICAL.component, childNode);
529                 result.writeTriple(childNode, RDF.TYPE, childType);
530             }
531             extract(index, ctx, childNode, child, result, false);
532         });
533     }
534 
535 }