View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.plugin.officescraper;
19  
20  import org.apache.any23.extractor.ExtractionContext;
21  import org.apache.any23.extractor.ExtractionException;
22  import org.apache.any23.extractor.ExtractionParameters;
23  import org.apache.any23.extractor.ExtractionResult;
24  import org.apache.any23.extractor.Extractor;
25  import org.apache.any23.extractor.ExtractorDescription;
26  import org.apache.any23.rdf.RDFUtils;
27  import org.apache.any23.vocab.Excel;
28  import org.apache.poi.hssf.usermodel.HSSFWorkbook;
29  import org.apache.poi.ss.usermodel.Cell;
30  import org.apache.poi.ss.usermodel.CellType;
31  import org.apache.poi.ss.usermodel.Row;
32  import org.apache.poi.ss.usermodel.Sheet;
33  import org.apache.poi.ss.usermodel.Workbook;
34  import org.apache.poi.xssf.usermodel.XSSFWorkbook;
35  import org.eclipse.rdf4j.model.IRI;
36  import org.eclipse.rdf4j.model.vocabulary.RDF;
37  
38  import java.io.IOException;
39  import java.io.InputStream;
40  
41  /**
42   * Implementation of {@link org.apache.any23.extractor.Extractor.ContentExtractor} able to process
43   * a <i>MS Excel 97-2007+</i> file format <i>.xls/.xlsx</i> and
44   * convert the detected content to triples.
45   * This extractor is based on
46   * <a href="http://poi.apache.org/spreadsheet/index.html">Apache POI-HSSF and POI-XSSF Java API</a>.
47   *
48   * @author Michele Mostarda (mostarda@fbk.eu)
49   */
50  public class ExcelExtractor implements Extractor.ContentExtractor {
51  
52      private static final Excel excel = Excel.getInstance();
53  
54      private boolean stopAtFirstError = false;
55  
56      public ExcelExtractor() {}
57  
58      public boolean isStopAtFirstError() {
59          return stopAtFirstError;
60      }
61  
62      @Override
63      public void setStopAtFirstError(boolean f) {
64          stopAtFirstError = f;
65      }
66  
67      @Override
68      public ExtractorDescription getDescription() {
69          return ExcelExtractorFactory.getDescriptionInstance();
70      }
71  
72      @Override
73      public void run(
74              ExtractionParameters extractionParameters,
75              ExtractionContext context,
76              InputStream in,
77              ExtractionResult er
78      ) throws IOException, ExtractionException {
79          try {
80              final IRI documentIRI = context.getDocumentIRI();
81              final Workbook workbook = createWorkbook(documentIRI, in);
82              processWorkbook(documentIRI, workbook, er);
83          } catch (Exception e) {
84              throw new ExtractionException("An error occurred while extracting MS Excel content.", e);
85          }
86      }
87  
88      // TODO: this should be done by Tika, the extractors should be split.
89      private Workbook createWorkbook(IRI document, InputStream is) throws IOException {
90          final String documentIRI = document.toString();
91          if (documentIRI.endsWith(".xlsx")) {
92              return new XSSFWorkbook(is);
93          } else if (documentIRI.endsWith("xls")) {
94              return new HSSFWorkbook(is);
95          } else {
96              throw new IllegalArgumentException("Unsupported extension for resource [" + documentIRI + "]");
97          }
98      }
99  
100     private void processWorkbook(IRI documentIRI, Workbook wb, ExtractionResult er) {
101         for (int sheetIndex = 0; sheetIndex < wb.getNumberOfSheets(); sheetIndex++) {
102             final Sheet sheet = wb.getSheetAt(sheetIndex);
103             final IRI sheetIRI = getSheetIRI(documentIRI, sheet);
104             er.writeTriple(documentIRI, excel.containsSheet, sheetIRI);
105             er.writeTriple(sheetIRI, RDF.TYPE, excel.sheet);
106             writeSheetMetadata(sheetIRI, sheet, er);
107             for (Row row : sheet) {
108                 final IRI rowIRI = getRowIRI(sheetIRI, row);
109                 er.writeTriple(sheetIRI, excel.containsRow, rowIRI);
110                 er.writeTriple(rowIRI, RDF.TYPE, excel.row);
111                 writeRowMetadata(rowIRI, row, er);
112                 for (Cell cell : row) {
113                     writeCell(rowIRI, cell, er);
114                 }
115             }
116         }
117     }
118 
119     private void writeSheetMetadata(IRI sheetIRI, Sheet sheet, ExtractionResult er) {
120         final String sheetName   = sheet.getSheetName();
121         final int    firstRowNum = sheet.getFirstRowNum();
122         final int    lastRowNum  = sheet.getLastRowNum();
123         er.writeTriple(sheetIRI, excel.sheetName, RDFUtils.literal(sheetName));
124         er.writeTriple(sheetIRI, excel.firstRow, RDFUtils.literal(firstRowNum));
125         er.writeTriple(sheetIRI, excel.lastRow, RDFUtils.literal(lastRowNum));
126     }
127 
128     private void writeRowMetadata(IRI rowIRI, Row row, ExtractionResult er) {
129         final int    firstCellNum = row.getFirstCellNum();
130         final int    lastCellNum  = row.getLastCellNum();
131         er.writeTriple(rowIRI, excel.firstCell , RDFUtils.literal(firstCellNum));
132         er.writeTriple(rowIRI, excel.lastCell  , RDFUtils.literal(lastCellNum ));
133     }
134 
135     private void writeCell(IRI rowIRI, Cell cell, ExtractionResult er) {
136         final IRI cellType = cellTypeToType(cell.getCellType());
137         if (cellType == null)
138             return; // Skip unsupported cells.
139         final IRI cellIRI = getCellIRI(rowIRI, cell);
140         er.writeTriple(rowIRI, excel.containsCell, cellIRI);
141         er.writeTriple(cellIRI, RDF.TYPE, excel.cell);
142         er.writeTriple(
143                 cellIRI,
144                 excel.cellValue,
145                 RDFUtils.literal(cell.getStringCellValue(), cellType)
146         );
147     }
148 
149     private IRI getSheetIRI(IRI documentIRI, Sheet sheet) {
150         return RDFUtils.iri(documentIRI.toString() + "/sheet/" + sheet.getSheetName());
151     }
152 
153     private IRI getRowIRI(IRI sheetIRI, Row row) {
154         return RDFUtils.iri(sheetIRI.toString() + "/" + row.getRowNum());
155     }
156 
157     private IRI getCellIRI(IRI rowIRI, Cell cell) {
158         return RDFUtils.iri(rowIRI +
159 		String.format("/%d/", cell.getColumnIndex()));
160     }
161 
162     private IRI cellTypeToType(CellType cellType) {
163         final String postfix;
164         if (cellType == null) {
165             postfix = null;
166         } else {
167             switch (cellType) {
168                 case STRING:
169                     postfix = "string";
170                     break;
171                 case BOOLEAN:
172                     postfix = "boolean";
173                     break;
174                 case NUMERIC:
175                     postfix = "numeric";
176                     break;
177                 default:
178                     postfix = null;
179             }
180         }
181         return postfix == null ? null : RDFUtils.iri(excel.getNamespace().toString() + postfix);
182     }
183 
184 
185 }