View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  package org.apache.any23.plugin.extractor.openie;
18  
19  import java.io.IOException;
20  import java.util.List;
21  
22  import javax.xml.transform.TransformerConfigurationException;
23  import javax.xml.transform.TransformerFactoryConfigurationError;
24  
25  import org.apache.any23.extractor.Extractor;
26  import org.apache.any23.extractor.IssueReport;
27  import org.apache.any23.extractor.ExtractionContext;
28  import org.apache.any23.extractor.ExtractorDescription;
29  import org.apache.any23.plugin.Author;
30  import org.apache.any23.rdf.RDFUtils;
31  import org.apache.any23.util.StreamUtils;
32  import org.apache.tika.Tika;
33  import org.apache.tika.exception.TikaException;
34  import org.eclipse.rdf4j.model.IRI;
35  import org.eclipse.rdf4j.model.Resource;
36  import org.eclipse.rdf4j.model.Value;
37  import org.eclipse.rdf4j.model.vocabulary.RDF;
38  import org.eclipse.rdf4j.model.vocabulary.RDFS;
39  import org.apache.any23.extractor.ExtractionException;
40  import org.apache.any23.extractor.ExtractionParameters;
41  import org.apache.any23.extractor.ExtractionResult;
42  
43  import org.slf4j.Logger;
44  import org.slf4j.LoggerFactory;
45  import org.w3c.dom.Document;
46  
47  import edu.knowitall.openie.Argument;
48  import edu.knowitall.openie.Instance;
49  import edu.knowitall.openie.OpenIE;
50  import edu.knowitall.tool.parse.ClearParser;
51  import edu.knowitall.tool.postag.ClearPostagger;
52  import edu.knowitall.tool.srl.ClearSrl;
53  import edu.knowitall.tool.tokenize.ClearTokenizer;
54  import scala.collection.JavaConversions;
55  import scala.collection.Seq;
56  
57  /**
58   * An <a href="https://github.com/allenai/openie-standalone">OpenIE</a> 
59   * extractor able to generate <i>RDF</i> statements from 
60   * sentences representing relations in the text.
61   */
62  @Author(name="Lewis John McGibbney (lewismc@apache.org)")
63  public class OpenIEExtractor implements Extractor.TagSoupDOMExtractor {
64  
65      private static final Logger LOG = LoggerFactory.getLogger(OpenIEExtractor.class);
66  
67      /**
68       * default constructor
69       */
70      public OpenIEExtractor() {
71          // default constructor
72      }
73  
74      /**
75       * @see org.apache.any23.extractor.Extractor#getDescription()
76       */
77      @Override
78      public ExtractorDescription getDescription() {
79          return OpenIEExtractorFactory.getDescriptionInstance();
80      }
81  
82      @Override
83      public void run(ExtractionParameters extractionParameters,
84              ExtractionContext context, Document in, ExtractionResult out)
85                      throws IOException, ExtractionException {
86  
87          Runtime runtime = Runtime.getRuntime();
88          long maxMemory = runtime.maxMemory();
89          //free up as much memory as possible before performing this calculation
90          runtime.gc();
91          long usedMemory = Math.max(0L, runtime.totalMemory() - runtime.freeMemory());
92          long availableMemory = maxMemory - usedMemory;
93          if (availableMemory < 4294967296L) {
94              out.notifyIssue(IssueReport.IssueLevel.FATAL,
95                      "Not enough heap space available to perform OpenIE extraction: "
96                              + (availableMemory/1048576L) + "/" + (maxMemory / 1048576L)
97                              + " MB. Requires 4096 MB.", -1, -1);
98              LOG.error("Increase JVM heap size when running OpenIE extractor. max=" + maxMemory + "; available=" + availableMemory);
99              return;
100         }
101 
102         IRI documentIRI = context.getDocumentIRI();
103         RDFUtils.iri(documentIRI.toString() + "root");
104         out.writeNamespace(RDF.PREFIX, RDF.NAMESPACE);
105         out.writeNamespace(RDFS.PREFIX, RDFS.NAMESPACE);
106         LOG.debug("Processing: {}", documentIRI.toString());
107 
108         OpenIE openIE = new OpenIE(
109                 new ClearParser(
110                         new ClearPostagger(
111                                 new ClearTokenizer())), new ClearSrl(), false, false);
112 
113         Seq<Instance> extractions = null;
114         Tika tika = new Tika();
115         try {
116             extractions = openIE.extract(tika.parseToString(StreamUtils.documentToInputStream(in)));
117         } catch (TransformerConfigurationException | TransformerFactoryConfigurationError e) {
118             LOG.error("Encountered error during OpenIE extraction.", e);
119         } catch (TikaException e) {
120             LOG.error("Encountered error whilst parsing InputStream with Tika.", e);
121         }
122 
123         List<Instance> listExtractions = JavaConversions.seqAsJavaList(extractions);
124         // for each extraction instance we can obtain a number of extraction elements
125         // instance.confidence() - a confidence value for the extraction itself
126         // instance.extr().context() - an optional representation of the context for this extraction
127         // instance.extr().arg1().text() - subject
128         // instance.extr().rel().text() - predicate
129         // instance.extr().arg2s().text() - object
130         String thresholdString;
131         try {
132             thresholdString = extractionParameters.getProperty("any23.extraction.openie.confidence.threshold");
133         } catch (RuntimeException e) {
134             thresholdString = null;
135         }
136         double threshold = thresholdString == null ? 0.5 : Double.parseDouble(thresholdString);
137         for(Instance instance : listExtractions) {
138             if (instance.confidence() > threshold) {
139                 List<Argument> listArg2s = JavaConversions.seqAsJavaList(instance.extr().arg2s());
140                 for(Argument argument : listArg2s) {
141                     Resource subject = RDFUtils.makeIRI(instance.extr().arg1().text(), documentIRI);
142                     IRI predicate = (IRI) RDFUtils.makeIRI(instance.extr().rel().text(), documentIRI);
143                     Value object = RDFUtils.toValue(argument.text());
144                     out.writeTriple(subject, predicate, object);
145                 }
146             }
147         }
148     }
149 }