View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.csv;
19  
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.commons.csv.CSVFormat;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.Iterator;
29  
30  /**
31   * This class is responsible to build a reader first guessing the configuration
32   * from the file it self and then, if not successful, from the {@link org.apache.any23.configuration.DefaultConfiguration}.
33   *
34   * @author Davide Palmisano ( dpalmisano@gmail.com )
35   * @author Michele Mostarda ( michele.mostarda@gmail.com )
36   */
37  public class CSVReaderBuilder {
38  
39      private static final String DEFAULT_FIELD_DELIMITER = ",";
40  
41      private static final String DEFAULT_COMMENT_DELIMITER = "#";
42  
43      private static final char[] popularDelimiters = {'\t', '|', ',', ';'};
44  
45      private static DefaultConfiguration defaultConfiguration =
46              DefaultConfiguration.singleton();
47  
48      private static final CSVFormat[] strategies;
49  
50      static {
51          strategies = new CSVFormat[popularDelimiters.length + 1];
52          strategies[0] = CSVFormat.DEFAULT;
53          int index = 1;
54          for (char dlmt : popularDelimiters) {
55              strategies[index++] = CSVFormat.DEFAULT.withDelimiter(dlmt);
56          }
57      }
58  
59      /**
60       * Builds a not <code>null</code> {@link org.apache.commons.csv.CSVParser} guessing
61       * from the provided <i>CSV</i> file.
62       *
63       * @param is {@link InputStream} of the <i>CSV</i> file where guess the configuration.
64       * @return a {@link CSVParser}
65       * @throws java.io.IOException
66       */
67      public static CSVParser build(InputStream is) throws IOException {
68          CSVFormat bestStrategy = getBestStrategy(is);
69          if (bestStrategy == null)
70              bestStrategy = getCSVStrategyFromConfiguration();
71          return new CSVParser(new InputStreamReader(is), bestStrategy);
72      }
73  
74      /**
75       * Checks whether the given input stream is a CSV or not.
76       *
77       * @param is input stream to be verified.
78       * @return <code>true</code> if the given <code>is</code> input stream contains a <i>CSV</i> content.
79       *         <code>false</code> otherwise.
80       * @throws IOException
81       */
82      public static boolean isCSV(InputStream is) throws IOException {
83          return getBestStrategy(is) != null;
84      }
85  
86      private static CSVFormat getBestStrategy(InputStream is) throws IOException {
87          for (CSVFormat strategy : strategies) {
88              if (testStrategy(is, strategy)) {
89                  return strategy;
90              }
91          }
92          return null;
93      }
94  
95      private static CSVFormat getCSVStrategyFromConfiguration() {
96          char fieldDelimiter = getCharValueFromConfiguration(
97                  "any23.extraction.csv.field",
98                  DEFAULT_FIELD_DELIMITER
99          );
100         char commentDelimiter = getCharValueFromConfiguration(
101                 "any23.extraction.csv.comment",
102                 DEFAULT_COMMENT_DELIMITER
103         );
104         return CSVFormat.DEFAULT.withDelimiter(fieldDelimiter).withCommentMarker(commentDelimiter);
105     }
106 
107     private static char getCharValueFromConfiguration(String property, String defaultValue) {
108         String delimiter = defaultConfiguration.getProperty(
109                 property,
110                 defaultValue
111         );
112         if (delimiter.length() != 1) {
113             throw new RuntimeException(property + " value must be a single character");
114         }
115         return delimiter.charAt(0);
116     }
117 
118     /**
119      * make sure the reader has correct delimiter and quotation set.
120      * Check first lines and make sure they have the same amount of columns and at least 2
121      *
122      * @param is input stream to be checked
123      * @param strategy strategy to be verified.
124      * @return
125      * @throws IOException
126      * @param is
127      */
128     private static boolean testStrategy(InputStream is, CSVFormat strategy) throws IOException {
129         final int MIN_COLUMNS = 2;
130 
131         is.mark(Integer.MAX_VALUE);
132         try {
133             final Iterator<CSVRecord> rows = new CSVParser(new InputStreamReader(is), strategy).iterator();
134             int linesToCheck = 5;
135             int headerColumnCount = -1;
136             while (linesToCheck > 0 && rows.hasNext()) {
137                 int rowLength = rows.next().size();
138                 if (rowLength < MIN_COLUMNS) {
139                     return false;
140                 }
141                 if (headerColumnCount == -1) { // first row
142                     headerColumnCount = rowLength;
143                 } else { // make sure rows have the same number of columns or one more than the header
144                     if (rowLength < headerColumnCount) {
145                         return false;
146                     } else if (rowLength - 1 > headerColumnCount) {
147                         return false;
148                     }
149                 }
150                 linesToCheck--;
151             }
152             return true;
153         } finally {
154             is.reset();
155         }
156     }
157 
158 
159 }