View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.csv;
19  
import static java.lang.Character.toUpperCase;

import org.apache.any23.extractor.ExtractionContext;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.ExtractionResult;
import org.apache.any23.extractor.Extractor;
import org.apache.any23.extractor.ExtractorDescription;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.vocab.CSV;
import org.apache.commons.csv.CSVParser;
import org.apache.commons.csv.CSVRecord;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.Value;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.eclipse.rdf4j.model.vocabulary.RDF;
import org.eclipse.rdf4j.model.vocabulary.RDFS;
import org.eclipse.rdf4j.model.vocabulary.XMLSchema;

import java.io.IOException;
import java.io.InputStream;
import java.math.BigInteger;
import java.util.StringTokenizer;
import java.util.Iterator;
import java.util.Locale;

44  
45  /**
 * This extractor produces <i>RDF</i> from a <i>CSV file</i>. It automatically detects the field <i>delimiter</i>; if
 * it is not able to, it uses the one provided in the <i>Any23</i> configuration.
48   *
49   * @see CSVReaderBuilder
50   * 
51   * @author Davide Palmisano ( dpalmisano@gmail.com )
52   */
53  public class CSVExtractor implements Extractor.ContentExtractor {
54  
55      private CSVParser csvParser;
56  
57      private IRI[] headerIRIs;
58  
59      private CSV csv = CSV.getInstance();
60  
61      /**
62       * {@inheritDoc}
63       */
64      @Override
65      public void setStopAtFirstError(boolean f) {
66          // not implemented
67      }
68  
69      /**
70       * {@inheritDoc}
71       */
72      @Override
73      public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, InputStream in,
74              ExtractionResult out) throws IOException, ExtractionException {
75          final IRI documentIRI = extractionContext.getDocumentIRI();
76  
77          // build the parser
78          csvParser = CSVReaderBuilder.build(in);
79          Iterator<CSVRecord> rows = csvParser.iterator();
80  
81          // get the header and generate the IRIs for column names
82          CSVRecord header = rows.hasNext() ? rows.next() : null;
83          headerIRIs = processHeader(header, documentIRI);
84  
85          // write triples to describe properties
86          writeHeaderPropertiesMetadata(header, out);
87  
88          int index = 0;
89          while (rows.hasNext()) {
90              CSVRecord nextLine = rows.next();
91              IRI rowSubject = RDFUtils.iri(documentIRI.toString(), "row/" + index);
92              // add a row type
93              out.writeTriple(rowSubject, RDF.TYPE, csv.rowType);
94              // for each row produce its statements
95              produceRowStatements(rowSubject, nextLine, out);
96              // link the row to the document
97              out.writeTriple(documentIRI, csv.row, rowSubject);
98              // the progressive row number
99              out.writeTriple(rowSubject, csv.rowPosition,
100                     SimpleValueFactory.getInstance().createLiteral(String.valueOf(index)));
101             index++;
102         }
103         // add some CSV metadata such as the number of rows and columns
104         addTableMetadataStatements(documentIRI, out, index, headerIRIs.length);
105     }
106 
107     /**
108      * Check whether a number is an integer.
109      *
110      * @param number
111      * 
112      * @return
113      */
114     private boolean isInteger(String number) {
115         try {
116             Integer.valueOf(number);
117             return true;
118         } catch (NumberFormatException e) {
119             return false;
120         }
121     }
122 
123     /**
124      * Check whether a number is a float.
125      *
126      * @param number
127      * 
128      * @return
129      */
130     private boolean isFloat(String number) {
131         try {
132             Float.valueOf(number);
133             return true;
134         } catch (NumberFormatException e) {
135             return false;
136         }
137     }
138 
139     /**
140      * It writes <i>RDF</i> statements representing properties of the header.
141      *
142      * @param header
143      * @param out
144      */
145     private void writeHeaderPropertiesMetadata(CSVRecord header, ExtractionResult out) {
146         int index = 0;
147         for (IRI singleHeader : headerIRIs) {
148             if (index > headerIRIs.length) {
149                 break;
150             }
151             String headerString = header.get(index);
152             if (!RDFUtils.isAbsoluteIRI(headerString)) {
153                 out.writeTriple(singleHeader, RDFS.LABEL, SimpleValueFactory.getInstance().createLiteral(headerString));
154             }
155             out.writeTriple(singleHeader, csv.columnPosition,
156                     SimpleValueFactory.getInstance().createLiteral(String.valueOf(index), XMLSchema.INTEGER));
157             index++;
158         }
159     }
160 
161     /**
162      * It process the first row of the file, returning a list of {@link IRI}s representing the properties for each
163      * column. If a value of the header is an absolute <i>IRI</i> then it leave it as is. Otherwise the
164      * {@link org.apache.any23.vocab.CSV} vocabulary is used.
165      *
166      * @param header
167      * 
168      * @return an array of {@link IRI}s identifying the column names.
169      */
170     private IRI[] processHeader(CSVRecord header, IRI documentIRI) {
171         if (header == null)
172             return new IRI[0];
173 
174         IRI[] result = new IRI[header.size()];
175         int index = 0;
176         for (String h : header) {
177             String candidate = h.trim();
178             if (RDFUtils.isAbsoluteIRI(candidate)) {
179                 result[index] = SimpleValueFactory.getInstance().createIRI(candidate);
180             } else {
181                 result[index] = normalize(candidate, documentIRI);
182             }
183             index++;
184         }
185         return result;
186     }
187 
188     private IRI normalize(String toBeNormalized, IRI documentIRI) {
189         String newToBeNormalized = toBeNormalized.trim().toLowerCase(Locale.ROOT).replace("?", "").replace("&", "");
190 
191         StringBuilder result = new StringBuilder(documentIRI.toString());
192 
193         StringTokenizer tokenizer = new StringTokenizer(newToBeNormalized, " ");
194         while (tokenizer.hasMoreTokens()) {
195             String current = tokenizer.nextToken();
196 
197             result.append(toUpperCase(current.charAt(0))).append(current.substring(1));
198         }
199 
200         return SimpleValueFactory.getInstance().createIRI(result.toString());
201     }
202 
203     /**
204      * It writes on the provided {@link ExtractionResult}, the </>RDF statements</> representing the row <i>cell</i>. If
205      * a row <i>cell</i> is an absolute <i>IRI</i> then an object property is written, literal otherwise.
206      *
207      * @param rowSubject
208      * @param values
209      * @param out
210      */
211     private void produceRowStatements(IRI rowSubject, CSVRecord values, ExtractionResult out) {
212         int index = 0;
213         for (String cell : values) {
214             if (index >= headerIRIs.length) {
215                 // there are some row cells that don't have an associated column name
216                 break;
217             }
218             if ("".equals(cell)) {
219                 index++;
220                 continue;
221             }
222             IRI predicate = headerIRIs[index];
223             Value object = getObjectFromCell(cell);
224             out.writeTriple(rowSubject, predicate, object);
225             index++;
226         }
227     }
228 
229     private Value getObjectFromCell(String cell) {
230         Value object;
231         String newCell = cell.trim();
232         if (RDFUtils.isAbsoluteIRI(newCell)) {
233             object = SimpleValueFactory.getInstance().createIRI(newCell);
234         } else {
235             IRI datatype = XMLSchema.STRING;
236             if (isInteger(newCell)) {
237                 datatype = XMLSchema.INTEGER;
238             } else if (isFloat(newCell)) {
239                 datatype = XMLSchema.FLOAT;
240             }
241             object = SimpleValueFactory.getInstance().createLiteral(newCell, datatype);
242         }
243         return object;
244     }
245 
246     /**
247      * It writes on the provided {@link ExtractionResult} some <i>RDF Statements</i> on generic properties of the
248      * <i>CSV</i> file, such as number of rows and columns.
249      *
250      * @param documentIRI
251      * @param out
252      * @param numberOfRows
253      * @param numberOfColumns
254      */
255     private void addTableMetadataStatements(IRI documentIRI, ExtractionResult out, int numberOfRows,
256             int numberOfColumns) {
257         out.writeTriple(documentIRI, csv.numberOfRows,
258                 SimpleValueFactory.getInstance().createLiteral(String.valueOf(numberOfRows), XMLSchema.INTEGER));
259         out.writeTriple(documentIRI, csv.numberOfColumns,
260                 SimpleValueFactory.getInstance().createLiteral(String.valueOf(numberOfColumns), XMLSchema.INTEGER));
261     }
262 
263     /**
264      * {@inheritDoc}
265      */
266     @Override
267     public ExtractorDescription getDescription() {
268         return CSVExtractorFactory.getDescriptionInstance();
269     }
270 }