View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor;
19  
20  import org.apache.any23.extractor.html.MicroformatExtractor;
21  import org.apache.any23.rdf.Prefixes;
22  import org.apache.any23.writer.TripleHandler;
23  import org.apache.any23.writer.TripleHandlerException;
24  import org.eclipse.rdf4j.model.BNode;
25  import org.eclipse.rdf4j.model.Resource;
26  import org.eclipse.rdf4j.model.IRI;
27  import org.eclipse.rdf4j.model.Value;
28  
29  import java.io.PrintStream;
30  import java.util.ArrayList;
31  import java.util.Collection;
32  import java.util.Collections;
33  import java.util.HashSet;
34  import java.util.List;
35  import java.util.Locale;
36  import java.util.Set;
37  
38  /**
39   * <p>
40   * A default implementation of {@link ExtractionResult}; it receives extraction output from one {@link Extractor}
41   * working on one document, and passes the output on to a {@link TripleHandler}. It deals with details such as creation
42   * of {@link ExtractionContext} objects and closing any open contexts at the end of extraction.
43   * </p>
44   * <p>
45   * The {@link #close()} method must be invoked after the extractor has finished processing.
46   * </p>
47   * <p>
48   * There is usually no need to provide additional implementations of the ExtractionWriter interface.
49   * </p>
50   *
51   * @see org.apache.any23.writer.TripleHandler
52   * @see ExtractionContext
53   * 
54   * @author Richard Cyganiak (richard@cyganiak.de)
55   * @author Michele Mostarda (michele.mostarda@gmail.com)
56   */
57  public class ExtractionResultImpl implements TagSoupExtractionResult {
58  
59      private final ExtractionContext context;
60  
61      private final Extractor<?> extractor;
62  
63      private final TripleHandler tripleHandler;
64  
65      private final Collection<ExtractionResult> subResults = new ArrayList<>();
66  
67      private final Set<Object> knownContextIDs = new HashSet<>();
68  
69      private boolean isClosed = false;
70  
71      private boolean isInitialized = false;
72  
73      private List<Issue> issues;
74  
75      private List<ResourceRoot> resourceRoots;
76  
77      private List<PropertyPath> propertyPaths;
78  
79      public ExtractionResultImpl(ExtractionContext context, Extractor<?> extractor, TripleHandler tripleHandler) {
80          this(context, extractor, tripleHandler, new ArrayList<>());
81      }
82  
83      private ExtractionResultImpl(ExtractionContext context, Extractor<?> extractor, TripleHandler tripleHandler,
84              List<Issue> issues) {
85          if (context == null) {
86              throw new NullPointerException("context cannot be null.");
87          }
88          if (extractor == null) {
89              throw new NullPointerException("extractor cannot be null.");
90          }
91          if (tripleHandler == null) {
92              throw new NullPointerException("triple handler cannot be null.");
93          }
94  
95          this.extractor = extractor;
96          this.tripleHandler = tripleHandler;
97          this.context = context;
98          this.issues = issues;
99  
100         knownContextIDs.add(context.getUniqueID());
101 
102         try {
103             // openContext() must be called before extraction begins
104             // so that BenchmarkTripleHandler can report accurate times.
105             // See https://issues.apache.org/jira/browse/ANY23-337
106             tripleHandler.openContext(context);
107         } catch (TripleHandlerException e) {
108             throw new RuntimeException("Error while opening context", e);
109         }
110     }
111 
112     public boolean hasIssues() {
113         return !issues.isEmpty();
114     }
115 
116     public int getIssuesCount() {
117         return issues.size();
118     }
119 
120     @Override
121     public void printReport(PrintStream ps) {
122         ps.print(String.format(Locale.ROOT, "Context: %s [errors: %d] {\n", context, getIssuesCount()));
123         for (Issue issue : issues) {
124             ps.print(issue.toString());
125             ps.print("\n");
126         }
127         // Printing sub results.
128         for (ExtractionResult er : subResults) {
129             er.printReport(ps);
130         }
131         ps.print("}\n");
132     }
133 
134     @Override
135     public Collection<Issue> getIssues() {
136         return issues.isEmpty() ? Collections.<Issue> emptyList() : Collections.unmodifiableList(issues);
137     }
138 
139     @Override
140     public ExtractionResult openSubResult(ExtractionContext context) {
141         final String contextID = context.getUniqueID();
142         if (knownContextIDs.contains(contextID)) {
143             throw new IllegalArgumentException("Duplicate contextID: " + contextID);
144         }
145         knownContextIDs.add(contextID);
146 
147         checkOpen();
148         ExtractionResult result = new ExtractionResultImpl(context, extractor, tripleHandler, this.issues);
149         subResults.add(result);
150         return result;
151     }
152 
153     public ExtractionContext getExtractionContext() {
154         return context;
155     }
156 
157     @Override
158     public void writeTriple(Resource s, IRI p, Value o, IRI g) {
159         if (s == null || p == null || o == null)
160             return;
161         // Check for misconstructed literals or BNodes, Sesame does not catch this.
162         if (s.stringValue() == null || p.stringValue() == null || o.stringValue() == null) {
163             throw new IllegalArgumentException("The statement arguments must be not null.");
164         }
165         checkOpen();
166         try {
167             tripleHandler.receiveTriple(s, p, o, g, context);
168         } catch (TripleHandlerException e) {
169             throw new RuntimeException(String.format(Locale.ROOT, "Error while receiving triple %s %s %s", s, p, o), e);
170         }
171     }
172 
173     boolean wasTouched() {
174         return isInitialized;
175     }
176 
177     @Override
178     public void writeTriple(Resource s, IRI p, Value o) {
179         writeTriple(s, p, o, null);
180     }
181 
182     @Override
183     public void writeNamespace(String prefix, String uri) {
184         checkOpen();
185         try {
186             tripleHandler.receiveNamespace(prefix, uri, context);
187         } catch (TripleHandlerException e) {
188             throw new RuntimeException(String.format(Locale.ROOT, "Error while writing namespace %s:%s", prefix, uri),
189                     e);
190         }
191     }
192 
193     @Override
194     public void notifyIssue(IssueLevel level, String msg, long row, long col) {
195         issues.add(new Issue(level, msg, row, col));
196     }
197 
198     @Override
199     public void close() {
200         if (isClosed)
201             return;
202         isClosed = true;
203         for (ExtractionResult subResult : subResults) {
204             subResult.close();
205         }
206         try {
207             tripleHandler.closeContext(context);
208         } catch (TripleHandlerException e) {
209             throw new RuntimeException("Error while opening context", e);
210         }
211     }
212 
213     private void checkOpen() {
214         if (!isInitialized) {
215             isInitialized = true;
216             Prefixes prefixes = extractor.getDescription().getPrefixes();
217             for (String prefix : prefixes.allPrefixes()) {
218                 try {
219                     tripleHandler.receiveNamespace(prefix, prefixes.getNamespaceIRIFor(prefix), context);
220                 } catch (TripleHandlerException e) {
221                     throw new RuntimeException(String.format(Locale.ROOT, "Error while writing namespace %s", prefix),
222                             e);
223                 }
224             }
225         }
226         if (isClosed) {
227             throw new IllegalStateException("Not open: " + context);
228         }
229     }
230 
231     @Override
232     public void addResourceRoot(String[] path, Resource root, Class<? extends MicroformatExtractor> extractor) {
233         if (resourceRoots == null) {
234             resourceRoots = new ArrayList<>();
235         }
236         resourceRoots.add(new ResourceRoot(path, root, extractor));
237     }
238 
239     @Override
240     public List<ResourceRoot> getResourceRoots() {
241         List<ResourceRoot> allRoots = new ArrayList<>();
242         if (resourceRoots != null) {
243             allRoots.addAll(resourceRoots);
244         }
245         for (ExtractionResult er : subResults) {
246             ExtractionResultImpl/org/apache/any23/extractor/ExtractionResultImpl.html#ExtractionResultImpl">ExtractionResultImpl eri = (ExtractionResultImpl) er;
247             if (eri.resourceRoots != null) {
248                 allRoots.addAll(eri.resourceRoots);
249             }
250         }
251         return allRoots;
252     }
253 
254     @Override
255     public void addPropertyPath(Class<? extends MicroformatExtractor> extractor, Resource propertySubject,
256             Resource property, BNode object, String[] path) {
257         if (propertyPaths == null) {
258             propertyPaths = new ArrayList<>();
259         }
260         propertyPaths.add(new PropertyPath(path, propertySubject, property, object, extractor));
261     }
262 
263     @Override
264     public List<PropertyPath> getPropertyPaths() {
265         List<PropertyPath> allPaths = new ArrayList<>();
266         if (propertyPaths != null) {
267             allPaths.addAll(propertyPaths);
268         }
269         for (ExtractionResult er : subResults) {
270             ExtractionResultImpl/org/apache/any23/extractor/ExtractionResultImpl.html#ExtractionResultImpl">ExtractionResultImpl eri = (ExtractionResultImpl) er;
271             if (eri.propertyPaths != null) {
272                 allPaths.addAll(eri.propertyPaths);
273             }
274         }
275         return allPaths;
276     }
277 
278     @Override
279     public String toString() {
280         final StringBuilder sb = new StringBuilder();
281         sb.append(context.toString());
282         sb.append('\n');
283         if (issues != null) {
284             sb.append("Errors {\n");
285             for (Issue issue : issues) {
286                 sb.append('\t');
287                 sb.append(issue.toString());
288                 sb.append('\n');
289             }
290         }
291         sb.append("}\n");
292         return sb.toString();
293     }
294 
295 }