View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.rdf;
19  
20  import org.apache.any23.extractor.ExtractionContext;
21  import org.apache.any23.extractor.ExtractionException;
22  import org.apache.any23.extractor.ExtractionParameters;
23  import org.apache.any23.extractor.ExtractionResult;
24  import org.apache.any23.extractor.Extractor;
25  import org.apache.any23.extractor.IssueReport;
26  import org.eclipse.rdf4j.rio.RDFParser;
27  
28  import java.io.IOException;
29  import java.io.InputStream;
30  import java.io.PrintWriter;
31  import java.io.StringWriter;
32  
33  /**
34   * Base class for a generic <i>RDF</i> {@link org.apache.any23.extractor.Extractor.ContentExtractor}.
35   *
36   * @author Michele Mostarda (mostarda@fbk.eu)
37   * @author Hans Brende (hansbrende@apache.org)
38   */
39  public abstract class BaseRDFExtractor implements Extractor.ContentExtractor {
40  
41      private boolean verifyDataType;
42      private boolean stopAtFirstError;
43  
44      public BaseRDFExtractor() {
45          this(false, false);
46      }
47  
48      /**
49       * Constructor, allows to specify the validation and error handling policies.
50       *
51       * @param verifyDataType
52       *            if <code>true</code> the data types will be verified, if <code>false</code> will be ignored.
53       * @param stopAtFirstError
54       *            if <code>true</code> the parser will stop at first parsing error, if <code>false</code> will ignore
55       *            non blocking errors.
56       */
57      public BaseRDFExtractor(boolean verifyDataType, boolean stopAtFirstError) {
58          this.verifyDataType = verifyDataType;
59          this.stopAtFirstError = stopAtFirstError;
60      }
61  
62      protected abstract RDFParser getParser(ExtractionContext extractionContext, ExtractionResult extractionResult);
63  
64      public boolean isVerifyDataType() {
65          return verifyDataType;
66      }
67  
68      public void setVerifyDataType(boolean verifyDataType) {
69          this.verifyDataType = verifyDataType;
70      }
71  
72      public boolean isStopAtFirstError() {
73          return stopAtFirstError;
74      }
75  
76      @Override
77      public void setStopAtFirstError(boolean b) {
78          stopAtFirstError = b;
79      }
80  
81      @Override
82      public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, InputStream in,
83              ExtractionResult extractionResult) throws IOException, ExtractionException {
84          try {
85              final RDFParser parser = getParser(extractionContext, extractionResult);
86              parser.parse(in, extractionContext.getDocumentIRI().stringValue());
87          } catch (Exception ex) {
88              extractionResult.notifyIssue(IssueReport.IssueLevel.FATAL, toString(ex), -1, -1);
89          }
90      }
91  
92      // keep package-access to avoid backwards compatibility woes if protected (may move around later)
93      @SuppressWarnings("Duplicates")
94      static String toString(Throwable th) {
95          StringWriter writer = new StringWriter();
96          try (PrintWriter pw = new PrintWriter(writer)) {
97              th.printStackTrace(pw);
98          }
99          String string = writer.toString();
100         if (string.length() > 1024) {
101             return string.substring(0, 1021) + "...";
102         }
103         return string;
104     }
105 
106 }