View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.calendar;
19  
20  import biweekly.ICalDataType;
21  import biweekly.ICalVersion;
22  import biweekly.ICalendar;
23  import biweekly.component.ICalComponent;
24  import biweekly.component.VTimezone;
25  import biweekly.io.ParseWarning;
26  import biweekly.io.SkipMeException;
27  import biweekly.io.StreamReader;
28  import biweekly.io.TimezoneAssignment;
29  import biweekly.io.TimezoneInfo;
30  import biweekly.io.WriteContext;
31  import biweekly.io.json.JCalValue;
32  import biweekly.io.json.JsonValue;
33  import biweekly.io.scribe.ScribeIndex;
34  import biweekly.io.scribe.property.ICalPropertyScribe;
35  import biweekly.parameter.Encoding;
36  import biweekly.parameter.ICalParameters;
37  import biweekly.property.Geo;
38  import biweekly.property.ICalProperty;
39  import biweekly.util.DateTimeComponents;
40  import biweekly.util.ICalDateFormat;
41  import org.apache.any23.extractor.ExtractionContext;
42  import org.apache.any23.extractor.ExtractionException;
43  import org.apache.any23.extractor.ExtractionParameters;
44  import org.apache.any23.extractor.ExtractionResult;
45  import org.apache.any23.extractor.Extractor;
46  import org.apache.any23.extractor.IssueReport;
47  import org.apache.any23.vocab.ICAL;
48  import org.apache.commons.lang.StringUtils;
49  import org.eclipse.rdf4j.model.BNode;
50  import org.eclipse.rdf4j.model.IRI;
51  import org.eclipse.rdf4j.model.Value;
52  import org.eclipse.rdf4j.model.ValueFactory;
53  import org.eclipse.rdf4j.model.datatypes.XMLDatatypeUtil;
54  import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
55  import org.eclipse.rdf4j.model.vocabulary.RDF;
56  import org.eclipse.rdf4j.model.vocabulary.XMLSchema;
57  
58  import java.io.IOException;
59  import java.io.InputStream;
60  import java.io.PrintWriter;
61  import java.io.StringWriter;
62  import java.math.BigDecimal;
63  import java.math.BigInteger;
64  import java.util.Collection;
65  import java.util.List;
66  import java.util.Locale;
67  import java.util.Map;
68  import java.util.Objects;
69  import java.util.Set;
70  import java.util.TimeZone;
71  import java.util.regex.Matcher;
72  import java.util.regex.Pattern;
73  import java.util.stream.Collectors;
74  import java.util.stream.Stream;
75  
76  /**
77   * @author Hans Brende (hansbrende@apache.org)
78   */
79  abstract class BaseCalendarExtractor implements Extractor.ContentExtractor {
80  
81      @Override
82      public void setStopAtFirstError(boolean b) {
83          //unsupported
84      }
85  
86      private static final ValueFactory f = SimpleValueFactory.getInstance();
87      private static final ICAL vICAL = ICAL.getInstance();
88  
89      abstract StreamReader reader(InputStream inputStream);
90  
91      @Override
92      public final void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, InputStream inputStream,
93                            ExtractionResult result) throws IOException, ExtractionException {
94          result.writeNamespace(RDF.PREFIX, RDF.NAMESPACE);
95          result.writeNamespace(ICAL.PREFIX, ICAL.NS);
96          result.writeNamespace(XMLSchema.PREFIX, XMLSchema.NAMESPACE);
97  
98          ScribeIndex index = new ScribeIndex();
99          try (StreamReader reader = reader(inputStream)) {
100             ICalendar cal;
101             while ((cal = reader.readNext()) != null) {
102                 for (ParseWarning warning : reader.getWarnings()) {
103                     String message = warning.getMessage();
104                     Integer lineNumber = warning.getLineNumber();
105                     if (lineNumber == null) {
106                         result.notifyIssue(IssueReport.IssueLevel.WARNING, message, -1, -1);
107                     } else {
108                         result.notifyIssue(IssueReport.IssueLevel.WARNING, message, lineNumber, -1);
109                     }
110                 }
111 
112                 BNode calNode = f.createBNode();
113                 result.writeTriple(calNode, RDF.TYPE, vICAL.Vcalendar);
114                 WriteContext ctx = new WriteContext(ICalVersion.V2_0, cal.getTimezoneInfo(), null);
115                 extract(index, ctx, calNode, cal, result, true);
116             }
117         } catch (Exception e) {
118             result.notifyIssue(IssueReport.IssueLevel.FATAL, toString(e), -1, -1);
119         }
120     }
121 
122     private static String toString(Throwable th) {
123         StringWriter writer = new StringWriter();
124         try (PrintWriter pw = new PrintWriter(writer)) {
125             th.printStackTrace(pw);
126         }
127         String string = writer.toString();
128         if (string.length() > 200) {
129             return string.substring(0, 197) + "...";
130         }
131         return string;
132     }
133 
134 
135     private static String localNameOfType(String typeName) {
136         return camelCase(typeName, false);
137     }
138 
139     private static String localNameOfProperty(String propertyName) {
140         return camelCase(propertyName, true);
141     }
142 
143     private static String camelCase(String name, boolean forProperty) {
144         String[] nameComponents = name.toLowerCase(Locale.ENGLISH).split("-");
145         StringBuilder sb = new StringBuilder(name.length());
146         int i = 0;
147         if (forProperty) {
148             sb.append(nameComponents[i++]);
149         }
150         for (int len = nameComponents.length; i < len; i++) {
151             String n = nameComponents[i];
152             if (!n.isEmpty()) {
153                 int ind = Character.charCount(n.codePointAt(0));
154                 sb.append(n.substring(0, ind).toUpperCase(Locale.ENGLISH)).append(n.substring(ind));
155             }
156         }
157         return sb.toString();
158     }
159 
160     private static IRI type(String originalName) {
161         if (originalName.regionMatches(true, 0, "X-", 0, 2)) {
162             //non-standard class
163             return f.createIRI(ICAL.NS, "X-" + localNameOfType(originalName.substring(2)));
164         }
165 
166         String name = localNameOfType(originalName);
167 
168         try {
169             return Objects.requireNonNull(vICAL.getClass(name));
170         } catch (RuntimeException e) {
171             return null;
172         }
173     }
174 
175     private static IRI predicate(String originalName, ExtractionResult result) {
176         if (originalName.regionMatches(true, 0, "X-", 0, 2)) {
177             //non-standard property
178             return f.createIRI(ICAL.NS, "x-" + localNameOfProperty(originalName.substring(2)));
179         }
180 
181         String name = localNameOfProperty(originalName);
182 
183         try {
184             return Objects.requireNonNull(vICAL.getProperty(name));
185         } catch (RuntimeException e) {
186             IRI iri = f.createIRI(ICAL.NS, name);
187             result.notifyIssue(IssueReport.IssueLevel.ERROR,
188                     "property " + iri + " (" + originalName + ") not defined in " + ICAL.class.getName(),
189                     -1, -1);
190             return iri;
191         }
192     }
193 
194     private static final String NaN = Double.toString(Double.NaN);
195     private static String str(Double d) {
196         return d == null ? NaN : d.toString();
197     }
198 
199     private static BNode writeParams(BNode subject, IRI predicate, ICalParameters params, ExtractionResult result) {
200         BNode bNode = f.createBNode();
201         result.writeTriple(subject, predicate, bNode);
202         writeParams(bNode, params, result);
203         return bNode;
204     }
205 
206     private static void writeParams(BNode subject, ICalParameters params, ExtractionResult result) {
207         for (Map.Entry<String, List<String>> entry : params.getMap().entrySet()) {
208             List<String> strings = entry.getValue();
209             if (strings != null && !strings.isEmpty()) {
210                 IRI predicate = predicate(entry.getKey(), result);
211                 for (String v : strings) {
212                     result.writeTriple(subject, predicate, f.createLiteral(v));
213                 }
214             }
215         }
216     }
217 
218 
219     private static IRI dataType(ICalDataType dataType, Boolean isFloating) {
220         if (dataType == null || ICalDataType.TEXT.equals(dataType)) {
221             return XMLSchema.STRING;
222         } else if (ICalDataType.BOOLEAN.equals(dataType)) {
223             return XMLSchema.BOOLEAN;
224         } else if (ICalDataType.INTEGER.equals(dataType)) {
225             return XMLSchema.INTEGER;
226         } else if (ICalDataType.FLOAT.equals(dataType)) {
227             return XMLSchema.FLOAT;
228         } else if (ICalDataType.BINARY.equals(dataType)) {
229             return XMLSchema.BASE64BINARY;
230         } else if (ICalDataType.URI.equals(dataType)
231                 || ICalDataType.URL.equals(dataType)
232                 || ICalDataType.CONTENT_ID.equals(dataType)
233                 || ICalDataType.CAL_ADDRESS.equals(dataType)) {
234             return XMLSchema.ANYURI;
235         } else if (ICalDataType.DATE_TIME.equals(dataType)) {
236             if (isFloating == null) {
237                 return null;
238             }
239             return isFloating ? vICAL.DATE_TIME : XMLSchema.DATETIME;
240         } else if (ICalDataType.DATE.equals(dataType)) {
241             return XMLSchema.DATE;
242         } else if (ICalDataType.TIME.equals(dataType)) {
243             return XMLSchema.TIME;
244         } else if (ICalDataType.DURATION.equals(dataType)) {
245             return XMLSchema.DURATION;
246         } else if (ICalDataType.PERIOD.equals(dataType)) {
247             return vICAL.Value_PERIOD;
248         } else if (ICalDataType.RECUR.equals(dataType)) {
249             return vICAL.Value_RECUR;
250         } else {
251             return XMLSchema.STRING;
252         }
253     }
254 
255 
256     private static final Pattern durationWeeksPattern = Pattern.compile("(-?P)(\\d+)W");
257 
258     private static String normalizeAndReportIfInvalid(String s, IRI dataType, TimeZone zone, ExtractionResult result) {
259         if (dataType == null) {
260             return s;
261         }
262         try {
263             if (XMLSchema.DURATION.equals(dataType)) {
264                 Matcher m = durationWeeksPattern.matcher(s);
265                 if (m.matches()) {
266                     long days = Long.parseLong(m.group(2)) * 7;
267                     return m.group(1) + days + "D";
268                 }
269             } else if (vICAL.Value_PERIOD.equals(dataType)) {
270                 if (s.indexOf('/') == -1) {
271                     throw new IllegalArgumentException();
272                 }
273             } else if (zone != null && XMLSchema.DATETIME.equals(dataType)) {
274                 try {
275                     DateTimeComponents dt = DateTimeComponents.parse(s);
276                     if (!dt.isUtc()) {
277                         s = ICalDateFormat.DATE_TIME_EXTENDED.format(dt.toDate(zone), zone);
278                     }
279                 } catch (IllegalArgumentException e) {
280                     //ignore
281                 }
282             } else {
283                 s = XMLDatatypeUtil.normalize(s, dataType);
284             }
285 
286             if (!XMLDatatypeUtil.isValidValue(s, dataType)) {
287                 throw new IllegalArgumentException();
288             }
289         } catch (IllegalArgumentException e) {
290             String m = e.getMessage();
291             if (StringUtils.isBlank(m)) {
292                 m = "Not a valid " + dataType + " value: " + s;
293             }
294             result.notifyIssue(IssueReport.IssueLevel.ERROR, m, -1, -1);
295         }
296         return s;
297     }
298 
299     private static boolean writeValue(BNode subject, IRI predicate, JsonValue jsonValue, String lang, IRI dataType, TimeZone zone, ExtractionResult result) {
300         if (jsonValue == null || jsonValue.isNull()) {
301             return false;
302         }
303         Object val = jsonValue.getValue();
304         if (val != null) {
305             Value v;
306             if (val instanceof Byte) {
307                 v = f.createLiteral((byte)val);
308             } else if (val instanceof Short) {
309                 v = f.createLiteral((short)val);
310             } else if (val instanceof Integer) {
311                 v = f.createLiteral((int)val);
312             } else if (val instanceof Long) {
313                 v = f.createLiteral((long)val);
314             } else if (val instanceof Float) {
315                 v = f.createLiteral((float)val);
316             } else if (val instanceof Double) {
317                 v = f.createLiteral((double)val);
318             } else if (val instanceof Boolean) {
319                 v = f.createLiteral((boolean)val);
320             } else if (val instanceof BigInteger) {
321                 v = f.createLiteral((BigInteger)val);
322             } else if (val instanceof BigDecimal) {
323                 v = f.createLiteral((BigDecimal)val);
324             } else {
325                 String str = normalizeAndReportIfInvalid(val.toString(), dataType, zone, result);
326 
327                 if (XMLSchema.STRING.equals(dataType)) {
328                     if (lang == null) {
329                         v = f.createLiteral(str);
330                     } else {
331                         v = f.createLiteral(str, lang);
332                     }
333                 } else if (XMLSchema.ANYURI.equals(dataType)) {
334                     try {
335                         v = f.createIRI(str);
336                     } catch (IllegalArgumentException e) {
337                         v = f.createLiteral(str, dataType);
338                     }
339                 } else if (vICAL.Value_PERIOD.equals(dataType)) {
340                     String[] strs = str.split("/");
341                     if (strs.length == 2) {
342                         String firstPart = normalizeAndReportIfInvalid(strs[0], XMLSchema.DATETIME, zone, result);
343                         String secondPart = strs[1];
344                         if (secondPart.indexOf('P') != -1) { //duration
345                             secondPart = normalizeAndReportIfInvalid(secondPart, XMLSchema.DURATION, zone, result);
346                         } else {
347                             secondPart = normalizeAndReportIfInvalid(secondPart, XMLSchema.DATETIME, zone, result);
348                         }
349                         str = firstPart + "/" + secondPart;
350                     }
351                     v = f.createLiteral(str);
352                 } else if (dataType != null) {
353                     v = f.createLiteral(str, dataType);
354                 } else {
355                     v = f.createLiteral(str);
356                 }
357 
358             }
359             result.writeTriple(subject, predicate, v);
360             return true;
361         }
362 
363         List<JsonValue> array = jsonValue.getArray();
364         if (array != null && !array.isEmpty()) {
365             if (array.size() == 1) {
366                 return writeValue(subject, predicate, array.get(0), lang, dataType, zone, result);
367             } else {
368                 BNode bNode = f.createBNode();
369                 result.writeTriple(subject, predicate, bNode);
370                 for (JsonValue value : array) {
371                     writeValue(bNode, RDF.VALUE, value, lang, dataType, zone, result);
372                 }
373                 return true;
374             }
375         }
376 
377         Map<String, JsonValue> object = jsonValue.getObject();
378         if (object != null) {
379             BNode bNode = f.createBNode();
380             result.writeTriple(subject, predicate, bNode);
381             for (Map.Entry<String, JsonValue> entry : object.entrySet()) {
382                 writeValue(bNode, predicate(entry.getKey(), result), entry.getValue(), lang, XMLSchema.STRING, zone, result);
383             }
384             return true;
385         }
386 
387         return false;
388     }
389 
390     private static TimeZone parseTimeZoneId(String tzId) {
391         for (;;) {
392             TimeZone zone = ICalDateFormat.parseTimeZoneId(tzId);
393             if (zone != null) {
394                 return zone;
395             }
396             int ind = tzId.indexOf('/');
397             if (ind == -1) {
398                 return null;
399             }
400             tzId = tzId.substring(ind + 1);
401         }
402     }
403 
404     @SuppressWarnings("unchecked")
405     private static <T extends ICalProperty> void writeProperty(BNode subject, ICalPropertyScribe<T> scribe, ICalProperty property, WriteContext ctx, ExtractionResult result) {
406         try {
407             T prop = (T)property;
408 
409             ICalVersion version = ctx.getVersion();
410 
411             ICalDataType dataType = scribe.dataType(prop, version);
412 
413             ICalParameters params = scribe.prepareParameters(prop, ctx);
414 
415             String lang = params.getLanguage();
416             params.removeAll(ICalParameters.LANGUAGE);
417 
418             Encoding encoding = params.getEncoding();
419 
420             if (dataType == null) {
421                 dataType = params.getValue();
422                 if (dataType == null && Encoding.BASE64.equals(encoding)) {
423                     dataType = ICalDataType.BINARY;
424                 }
425             }
426             params.removeAll(ICalParameters.VALUE);
427 
428             if (ICalDataType.BINARY.equals(dataType)) {
429                 // RFC 5545 s. 3.2.7.
430                 // If the value type parameter is ";VALUE=BINARY", then the inline
431                 // encoding parameter MUST be specified with the value
432                 // ";ENCODING=BASE64"
433                 if (encoding != null && !Encoding.BASE64.equals(encoding)) {
434                     result.notifyIssue(IssueReport.IssueLevel.ERROR,
435                             "Invalid encoding " + encoding + " specified for BINARY value", -1, -1);
436                     dataType = null;
437                 }
438                 params.removeAll(ICalParameters.ENCODING);
439             }
440 
441             if (Encoding._8BIT.equals(encoding)) {
442                 // RFC 5545 s. 3.2.7.
443                 // The default encoding is "8BIT",
444                 // corresponding to a property value consisting of text.
445                 params.removeAll(ICalParameters.ENCODING);
446             }
447 
448             // RFC 5545 s. 3.1.4.
449             // There is not a property parameter to declare the charset used in a
450             //   property value.  The default charset for an iCalendar stream is UTF-8
451             //   as defined in [RFC3629].
452             params.removeAll(ICalParameters.CHARSET);
453 
454             IRI predicate = predicate(scribe.getPropertyName(version), result);
455 
456             if (ICalDataType.CAL_ADDRESS.equals(dataType)) {
457                 subject = writeParams(subject, predicate, params, result);
458                 predicate = vICAL.calAddress;
459             } else if (!params.isEmpty()) {
460                 subject = writeParams(subject, predicate, params, result);
461                 predicate = RDF.VALUE;
462             }
463 
464             if (prop instanceof Geo) {
465                 // RFC 5870
466                 Geo g = (Geo)prop;
467                 IRI value = f.createIRI("geo:" + str(g.getLatitude()) + "," + str(g.getLongitude()));
468                 result.writeTriple(subject, predicate, value);
469             } else {
470 
471                 String tzId = params.getTimezoneId();
472                 TimezoneInfo tzInfo = ctx.getTimezoneInfo();
473                 TimeZone timeZone = null;
474                 Boolean floating;
475                 if (tzId != null) {
476                     TimezoneAssignment assign = tzInfo.getTimezone(prop);
477                     if (assign != null) {
478                         timeZone = assign.getTimeZone();
479                     } else {
480                         timeZone = parseTimeZoneId(tzId);
481                         tzInfo.setFloating(prop, true);
482                     }
483                     floating = timeZone == null ? null : Boolean.FALSE;
484                 } else {
485                     floating = tzInfo.isFloating(prop);
486                 }
487 
488                 IRI dataTypeIRI = dataType(dataType, floating);
489 
490                 JCalValue jsonVal = scribe.writeJson(prop, ctx);
491                 List<JsonValue> jsonVals = jsonVal.getValues();
492 
493                 boolean mod = false;
494                 for (JsonValue value : jsonVals) {
495                     mod |= writeValue(subject, predicate, value, lang, dataTypeIRI, timeZone, result);
496                 }
497                 if (!mod) {
498                     result.writeTriple(subject, predicate, f.createLiteral(jsonVal.asSingle()));
499                 }
500             }
501         } catch (SkipMeException e) {
502             //ignore
503         }
504     }
505 
506     private static void extract(ScribeIndex index, WriteContext ctx, BNode node, ICalComponent component, ExtractionResult result, boolean writeTimezones) {
507         for (ICalProperty property : component.getProperties().values()) {
508             ctx.setParent(component);
509             writeProperty(node, index.getPropertyScribe(property), property, ctx, result);
510         }
511 
512         Stream<ICalComponent> components = component.getComponents().values().stream();
513 
514         if (writeTimezones) {
515             Collection<VTimezone> tzs = ctx.getTimezoneInfo().getComponents();
516             Set<String> tzIds = tzs.stream()
517                     .map(tz -> tz.getTimezoneId().getValue())
518                     .collect(Collectors.toSet());
519             components = Stream.concat(tzs.stream(), components.filter(c ->
520                     !(c instanceof VTimezone && tzIds.contains(((VTimezone) c).getTimezoneId().getValue())))
521             );
522         }
523 
524         components.forEachOrdered(child -> {
525             BNode childNode = f.createBNode();
526             String componentName = index.getComponentScribe(child).getComponentName();
527             IRI childType = type(componentName);
528 
529             if (childType == null) {
530                 result.writeTriple(node, predicate(componentName, result), childNode);
531             } else {
532                 result.writeTriple(node, vICAL.component, childNode);
533                 result.writeTriple(childNode, RDF.TYPE, childType);
534             }
535             extract(index, ctx, childNode, child, result, false);
536         });
537     }
538 
539 }
540