View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.csv;
19  
20  import org.apache.any23.configuration.DefaultConfiguration;
21  import org.apache.commons.csv.CSVParser;
22  import org.apache.commons.csv.CSVFormat;
23  import org.apache.commons.csv.CSVRecord;
24  
25  import java.io.IOException;
26  import java.io.InputStream;
27  import java.io.InputStreamReader;
28  import java.nio.charset.StandardCharsets;
29  import java.util.Iterator;
30  
31  /**
32   * This class is responsible to build a reader first guessing the configuration from the file it self and then, if not
33   * successful, from the {@link org.apache.any23.configuration.DefaultConfiguration}.
34   *
35   * @author Davide Palmisano ( dpalmisano@gmail.com )
36   * @author Michele Mostarda ( michele.mostarda@gmail.com )
37   */
38  public class CSVReaderBuilder {
39  
40      private static final String DEFAULT_FIELD_DELIMITER = ",";
41  
42      private static final String DEFAULT_COMMENT_DELIMITER = "#";
43  
44      private static final char[] popularDelimiters = { '\t', '|', ',', ';' };
45  
46      private static DefaultConfiguration defaultConfiguration = DefaultConfiguration.singleton();
47  
48      private static final CSVFormat[] strategies;
49  
50      static {
51          strategies = new CSVFormat[popularDelimiters.length + 1];
52          strategies[0] = CSVFormat.DEFAULT;
53          int index = 1;
54          for (char dlmt : popularDelimiters) {
55              strategies[index++] = CSVFormat.DEFAULT.withDelimiter(dlmt);
56          }
57      }
58  
59      /**
60       * Builds a not <code>null</code> {@link org.apache.commons.csv.CSVParser} guessing from the provided <i>CSV</i>
61       * file.
62       *
63       * @param is
64       *            {@link InputStream} of the <i>CSV</i> file where guess the configuration.
65       * 
66       * @return a {@link CSVParser}
67       * 
68       * @throws java.io.IOException
69       *             if there is an error building the parser
70       */
71      public static CSVParser build(InputStream is) throws IOException {
72          CSVFormat bestStrategy = getBestStrategy(is);
73          if (bestStrategy == null)
74              bestStrategy = getCSVStrategyFromConfiguration();
75          return new CSVParser(new InputStreamReader(is, StandardCharsets.UTF_8), bestStrategy);
76      }
77  
78      /**
79       * Checks whether the given input stream is a CSV or not.
80       *
81       * @param is
82       *            input stream to be verified.
83       * 
84       * @return <code>true</code> if the given <code>is</code> input stream contains a <i>CSV</i> content.
85       *         <code>false</code> otherwise.
86       * 
87       * @throws IOException
88       *             if there is an error processing the input stream
89       */
90      public static boolean isCSV(InputStream is) throws IOException {
91          return getBestStrategy(is) != null;
92      }
93  
94      private static CSVFormat getBestStrategy(InputStream is) throws IOException {
95          for (CSVFormat strategy : strategies) {
96              if (testStrategy(is, strategy)) {
97                  return strategy;
98              }
99          }
100         return null;
101     }
102 
103     private static CSVFormat getCSVStrategyFromConfiguration() {
104         char fieldDelimiter = getCharValueFromConfiguration("any23.extraction.csv.field", DEFAULT_FIELD_DELIMITER);
105         char commentDelimiter = getCharValueFromConfiguration("any23.extraction.csv.comment",
106                 DEFAULT_COMMENT_DELIMITER);
107         return CSVFormat.DEFAULT.withDelimiter(fieldDelimiter).withCommentMarker(commentDelimiter);
108     }
109 
110     private static char getCharValueFromConfiguration(String property, String defaultValue) {
111         String delimiter = defaultConfiguration.getProperty(property, defaultValue);
112         if (delimiter.length() != 1) {
113             throw new RuntimeException(property + " value must be a single character");
114         }
115         return delimiter.charAt(0);
116     }
117 
118     /**
119      * make sure the reader has correct delimiter and quotation set. Check first lines and make sure they have the same
120      * amount of columns and at least 2
121      *
122      * @param is
123      *            input stream to be checked
124      * @param strategy
125      *            strategy to be verified.
126      * 
127      * @return
128      * 
129      * @throws IOException
130      * 
131      * @param is
132      */
133     private static boolean testStrategy(InputStream is, CSVFormat strategy) throws IOException {
134         final int MIN_COLUMNS = 2;
135 
136         is.mark(Integer.MAX_VALUE);
137         try {
138             @SuppressWarnings("resource")
139             final Iterator<CSVRecord> rows = new CSVParser(new InputStreamReader(is, StandardCharsets.UTF_8), strategy)
140                     .iterator();
141             int linesToCheck = 5;
142             int headerColumnCount = -1;
143             while (linesToCheck > 0 && rows.hasNext()) {
144                 int rowLength = rows.next().size();
145                 if (rowLength < MIN_COLUMNS) {
146                     return false;
147                 }
148                 if (headerColumnCount == -1) { // first row
149                     headerColumnCount = rowLength;
150                 } else { // make sure rows have the same number of columns or one more than the header
151                     if (rowLength < headerColumnCount) {
152                         return false;
153                     } else if (rowLength - 1 > headerColumnCount) {
154                         return false;
155                     }
156                 }
157                 linesToCheck--;
158             }
159             return true;
160         } finally {
161             is.reset();
162         }
163     }
164 
165 }