View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.csv;
19  
20  import static java.lang.Character.toUpperCase;
21  
22  import org.apache.any23.extractor.ExtractionContext;
23  import org.apache.any23.extractor.ExtractionException;
24  import org.apache.any23.extractor.ExtractionParameters;
25  import org.apache.any23.extractor.ExtractionResult;
26  import org.apache.any23.extractor.Extractor;
27  import org.apache.any23.extractor.ExtractorDescription;
28  import org.apache.any23.rdf.RDFUtils;
29  import org.apache.any23.vocab.CSV;
30  import org.apache.commons.csv.CSVParser;
31  import org.apache.commons.csv.CSVRecord;
32  import org.eclipse.rdf4j.model.IRI;
33  import org.eclipse.rdf4j.model.Value;
34  import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
35  import org.eclipse.rdf4j.model.vocabulary.RDF;
36  import org.eclipse.rdf4j.model.vocabulary.RDFS;
37  import org.eclipse.rdf4j.model.vocabulary.XMLSchema;
38  
39  import java.io.IOException;
40  import java.io.InputStream;
41  import java.util.StringTokenizer;
42  import java.util.Iterator;
43  
44  /**
45   * This extractor produces <i>RDF</i> from a <i>CSV file</i> .
46   * It automatically detects fields <i>delimiter</i>. If not able uses
47   * the one provided in the <i>Any23</i> configuration.
48   *
49   * @see CSVReaderBuilder
50   * @author Davide Palmisano ( dpalmisano@gmail.com )
51   */
52  public class CSVExtractor implements Extractor.ContentExtractor {
53  
54      private CSVParser csvParser;
55  
56      private IRI[] headerIRIs;
57  
58      private CSV csv = CSV.getInstance();
59  
60      /**
61       * {@inheritDoc}
62       */
63      @Override
64      public void setStopAtFirstError(boolean f) {
65        //not implemented
66      }
67  
68      /**
69       * {@inheritDoc}
70       */
71      @Override
72      public void run(
73              ExtractionParameters extractionParameters,
74              ExtractionContext extractionContext,
75              InputStream in
76              , ExtractionResult out
77      ) throws IOException, ExtractionException {
78          final IRI documentIRI = extractionContext.getDocumentIRI();
79  
80          // build the parser
81          csvParser = CSVReaderBuilder.build(in);
82          Iterator<CSVRecord> rows = csvParser.iterator();
83  
84          // get the header and generate the IRIs for column names
85          CSVRecord header = rows.hasNext() ? rows.next() : null;
86          headerIRIs = processHeader(header, documentIRI);
87  
88          // write triples to describe properties
89          writeHeaderPropertiesMetadata(header, out);
90  
91          int index = 0;
92          while (rows.hasNext()) {
93              CSVRecord nextLine = rows.next();
94              IRI rowSubject = RDFUtils.iri(
95                      documentIRI.toString(),
96                      "row/" + index
97              );
98              // add a row type
99              out.writeTriple(rowSubject, RDF.TYPE, csv.rowType);
100             // for each row produce its statements
101             produceRowStatements(rowSubject, nextLine, out);
102             // link the row to the document
103             out.writeTriple(documentIRI, csv.row, rowSubject);
104             // the progressive row number
105             out.writeTriple(
106                     rowSubject,
107                     csv.rowPosition,
108                     SimpleValueFactory.getInstance().createLiteral(String.valueOf(index))
109             );
110             index++;
111         }
112         // add some CSV metadata such as the number of rows and columns
113         addTableMetadataStatements(
114                 documentIRI,
115                 out,
116                 index,
117                 headerIRIs.length
118         );
119     }
120 
121     /**
122      * Check whether a number is an integer.
123      *
124      * @param number
125      * @return
126      */
127     private boolean isInteger(String number) {
128         try {
129             Integer.valueOf(number);
130             return true;
131         } catch (NumberFormatException e) {
132             return false;
133         }
134     }
135 
136     /**
137      * Check whether a number is a float.
138      *
139      * @param number
140      * @return
141      */
142     private boolean isFloat(String number) {
143         try {
144             Float.valueOf(number);
145             return true;
146         } catch (NumberFormatException e) {
147             return false;
148         }
149     }
150 
151     /**
152      * It writes <i>RDF</i> statements representing properties of the header.
153      *
154      * @param header
155      * @param out
156      */
157     private void writeHeaderPropertiesMetadata(CSVRecord header, ExtractionResult out) {
158         int index = 0;
159         for (IRI singleHeader : headerIRIs) {
160             if (index > headerIRIs.length) {
161                 break;
162             }
163             String headerString = header.get(index);
164             if (!RDFUtils.isAbsoluteIRI(headerString)) {
165                 out.writeTriple(
166                         singleHeader,
167                         RDFS.LABEL,
168                         SimpleValueFactory.getInstance().createLiteral(headerString)
169                 );
170             }
171             out.writeTriple(
172                     singleHeader,
173                     csv.columnPosition,
174                     SimpleValueFactory.getInstance().createLiteral(String.valueOf(index), XMLSchema.INTEGER)
175             );
176             index++;
177         }
178     }
179 
180     /**
181      * It process the first row of the file, returning a list of {@link IRI}s representing
182      * the properties for each column. If a value of the header is an absolute <i>IRI</i>
183      * then it leave it as is. Otherwise the {@link org.apache.any23.vocab.CSV} vocabulary is used.
184      *
185      * @param header
186      * @return an array of {@link IRI}s identifying the column names.
187      */
188     private IRI[] processHeader(CSVRecord header, IRI documentIRI) {
189         if (header == null)
190             return new IRI[0];
191 
192         IRI[] result = new IRI[header.size()];
193         int index = 0;
194         for (String h : header) {
195             String candidate = h.trim();
196             if (RDFUtils.isAbsoluteIRI(candidate)) {
197                 result[index] = SimpleValueFactory.getInstance().createIRI(candidate);
198             } else {
199                 result[index] = normalize(candidate, documentIRI);
200             }
201             index++;
202         }
203         return result;
204     }
205 
206     private IRI normalize(String toBeNormalized, IRI documentIRI) {
207       String newToBeNormalized = toBeNormalized.trim().toLowerCase().replace("?", "").replace("&", "");
208 
209         StringBuilder result = new StringBuilder(documentIRI.toString());
210 
211         StringTokenizer tokenizer = new StringTokenizer(newToBeNormalized, " ");
212         while (tokenizer.hasMoreTokens()) {
213             String current = tokenizer.nextToken();
214 
215             result.append(toUpperCase(current.charAt(0))).append(current.substring(1));
216         }
217 
218         return SimpleValueFactory.getInstance().createIRI(result.toString());
219     }
220 
221     /**
222      * It writes on the provided {@link ExtractionResult}, the </>RDF statements</>
223      * representing the row <i>cell</i>. If a  row <i>cell</i> is an absolute <i>IRI</i>
224      * then an object property is written, literal otherwise.
225      *
226      * @param rowSubject
227      * @param values
228      * @param out
229      */
230     private void produceRowStatements(
231             IRI rowSubject,
232             CSVRecord values,
233             ExtractionResult out
234     ) {
235         int index = 0;
236         for (String cell : values) {
237             if (index >= headerIRIs.length) {
238                 // there are some row cells that don't have an associated column name
239                 break;
240             }
241             if ("".equals(cell)) {
242                 index++;
243                 continue;
244             }
245             IRI predicate = headerIRIs[index];
246             Value object = getObjectFromCell(cell);
247             out.writeTriple(rowSubject, predicate, object);
248             index++;
249         }
250     }
251 
252     private Value getObjectFromCell(String cell) {
253         Value object;
254         String newCell = cell.trim();
255         if (RDFUtils.isAbsoluteIRI(newCell)) {
256             object = SimpleValueFactory.getInstance().createIRI(newCell);
257         } else {
258             IRI datatype = XMLSchema.STRING;
259             if (isInteger(newCell)) {
260                 datatype = XMLSchema.INTEGER;
261             } else if(isFloat(newCell)) {
262                 datatype = XMLSchema.FLOAT;
263             }
264             object = SimpleValueFactory.getInstance().createLiteral(newCell, datatype);
265         }
266         return object;
267     }
268 
269     /**
270      * It writes on the provided {@link ExtractionResult} some <i>RDF Statements</i>
271      * on generic properties of the <i>CSV</i> file, such as number of rows and columns.
272      *
273      * @param documentIRI
274      * @param out
275      * @param numberOfRows
276      * @param numberOfColumns
277      */
278     private void addTableMetadataStatements(
279             IRI documentIRI,
280             ExtractionResult out,
281             int numberOfRows,
282             int numberOfColumns) {
283         out.writeTriple(
284                 documentIRI,
285                 csv.numberOfRows,
286                 SimpleValueFactory.getInstance().createLiteral(String.valueOf(numberOfRows), XMLSchema.INTEGER)
287         );
288         out.writeTriple(
289                 documentIRI,
290                 csv.numberOfColumns,
291                 SimpleValueFactory.getInstance().createLiteral(String.valueOf(numberOfColumns), XMLSchema.INTEGER)
292         );
293     }
294 
295     /**
296      * {@inheritDoc}
297      */
298     @Override
299     public ExtractorDescription getDescription() {
300         return CSVExtractorFactory.getDescriptionInstance();
301     }
302 }