View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor;
19  
20  import org.apache.any23.extractor.html.MicroformatExtractor;
21  import org.apache.any23.rdf.Prefixes;
22  import org.apache.any23.writer.TripleHandler;
23  import org.apache.any23.writer.TripleHandlerException;
24  import org.eclipse.rdf4j.model.BNode;
25  import org.eclipse.rdf4j.model.Resource;
26  import org.eclipse.rdf4j.model.IRI;
27  import org.eclipse.rdf4j.model.Value;
28  
29  import java.io.PrintStream;
30  import java.util.ArrayList;
31  import java.util.Collection;
32  import java.util.Collections;
33  import java.util.HashSet;
34  import java.util.List;
35  import java.util.Set;
36  
37  /**
38   * <p>
39   * A default implementation of {@link ExtractionResult}; it receives
40   * extraction output from one {@link Extractor} working on one document,
41   * and passes the output on to a {@link TripleHandler}. It deals with
42   * details such as creation of {@link ExtractionContext} objects
43   * and closing any open contexts at the end of extraction.
44   * </p>
45   * <p>
46   * The {@link #close()} method must be invoked after the extractor has
47   * finished processing.
48   * </p>
49   * <p>
50   * There is usually no need to provide additional implementations
51   * of the ExtractionWriter interface.
52   *</p>
53   *
54   * @see org.apache.any23.writer.TripleHandler
55   * @see ExtractionContext
56   * @author Richard Cyganiak (richard@cyganiak.de)
57   * @author Michele Mostarda (michele.mostarda@gmail.com)
58   */
59  public class ExtractionResultImpl implements TagSoupExtractionResult {
60  
61      private final ExtractionContext context;
62  
63      private final Extractor<?> extractor;
64  
65      private final TripleHandler tripleHandler;
66  
67      private final Collection<ExtractionResult> subResults = new ArrayList<>();
68  
69      private final Set<Object> knownContextIDs = new HashSet<>();
70  
71      private boolean isClosed = false;
72  
73      private boolean isInitialized = false;
74  
75      private List<Issue> issues;
76  
77      private List<ResourceRoot> resourceRoots;
78  
79      private List<PropertyPath> propertyPaths;
80  
81      public ExtractionResultImpl(
82              ExtractionContext context,
83              Extractor<?> extractor,
84              TripleHandler tripleHandler
85      ) {
86          this(context, extractor, tripleHandler, new ArrayList<>());
87      }
88  
89      private ExtractionResultImpl(
90              ExtractionContext context,
91              Extractor<?> extractor,
92              TripleHandler tripleHandler,
93              List<Issue> issues
94      ) {
95          if(context == null) {
96              throw new NullPointerException("context cannot be null.");
97          }
98          if(extractor == null) {
99              throw new NullPointerException("extractor cannot be null.");
100         }
101         if(tripleHandler == null) {
102             throw new NullPointerException("triple handler cannot be null.");
103         }
104 
105         this.extractor       = extractor;
106         this.tripleHandler   = tripleHandler;
107         this.context         = context;
108         this.issues          = issues;
109 
110         knownContextIDs.add( context.getUniqueID() );
111 
112         try {
113             // openContext() must be called before extraction begins
114             // so that BenchmarkTripleHandler can report accurate times.
115             // See https://issues.apache.org/jira/browse/ANY23-337
116             tripleHandler.openContext(context);
117         } catch (TripleHandlerException e) {
118             throw new RuntimeException("Error while opening context", e);
119         }
120     }
121 
122     public boolean hasIssues() {
123         return ! issues.isEmpty();
124     }
125 
126     public int getIssuesCount() {
127         return issues.size();
128     }
129 
130     @Override
131     public void printReport(PrintStream ps) {
132         ps.print(String.format("Context: %s [errors: %d] {\n", context, getIssuesCount()));
133         for (Issue issue : issues) {
134             ps.print(issue.toString());
135             ps.print("\n");
136         }
137         // Printing sub results.
138         for (ExtractionResult er : subResults) {
139             er.printReport(ps);
140         }
141         ps.print("}\n");
142     }
143 
144     @Override
145     public Collection<Issue> getIssues() {
146         return issues.isEmpty() ? Collections.<Issue>emptyList() : Collections.unmodifiableList(issues);
147     }
148 
149     @Override
150     public ExtractionResult openSubResult(ExtractionContext context) {
151         final String contextID = context.getUniqueID();
152         if (knownContextIDs.contains(contextID)) {
153             throw new IllegalArgumentException("Duplicate contextID: " + contextID);
154         }
155         knownContextIDs.add(contextID);
156 
157         checkOpen();
158         ExtractionResult result = new ExtractionResultImpl(context, extractor, tripleHandler, this.issues);
159         subResults.add(result);
160         return result;
161     }
162 
163     public ExtractionContext getExtractionContext() {
164         return context;
165     }
166 
167     @Override
168     public void writeTriple(Resource s, IRI p, Value o, IRI g) {
169         if (s == null || p == null || o == null) return;
170         // Check for misconstructed literals or BNodes, Sesame does not catch this.
171         if (s.stringValue() == null || p.stringValue() == null || o.stringValue() == null) {
172             throw new IllegalArgumentException("The statement arguments must be not null.");
173         }
174         checkOpen();
175         try {
176             tripleHandler.receiveTriple(s, p, o, g, context);
177         } catch (TripleHandlerException e) {
178             throw new RuntimeException(
179                     String.format("Error while receiving triple %s %s %s", s, p, o ),
180                     e
181             );
182         }
183     }
184 
185     boolean wasTouched() {
186         return isInitialized;
187     }
188 
189     @Override
190     public void writeTriple(Resource s, IRI p, Value o) {
191         writeTriple(s, p, o, null);
192     }
193 
194     @Override
195     public void writeNamespace(String prefix, String uri) {
196         checkOpen();
197         try {
198             tripleHandler.receiveNamespace(prefix, uri, context);
199         } catch (TripleHandlerException e) {
200             throw new RuntimeException(
201                     String.format("Error while writing namespace %s:%s", prefix, uri),
202                     e
203             );
204         }
205     }
206 
207     @Override
208     public void notifyIssue(IssueLevel level, String msg, long row, long col) {
209         issues.add(new Issue(level, msg, row, col));
210     }
211 
212     @Override
213     public void close() {
214         if (isClosed) return;
215         isClosed = true;
216         for (ExtractionResult subResult : subResults) {
217             subResult.close();
218         }
219         try {
220             tripleHandler.closeContext(context);
221         } catch (TripleHandlerException e) {
222             throw new RuntimeException("Error while opening context", e);
223         }
224     }
225 
226     private void checkOpen() {
227         if (!isInitialized) {
228             isInitialized = true;
229             Prefixes prefixes = extractor.getDescription().getPrefixes();
230             for (String prefix : prefixes.allPrefixes()) {
231                 try {
232                     tripleHandler.receiveNamespace(prefix, prefixes.getNamespaceIRIFor(prefix), context);
233                 } catch (TripleHandlerException e) {
234                     throw new RuntimeException(String.format("Error while writing namespace %s", prefix),
235                             e
236                     );
237                 }
238             }
239         }
240         if (isClosed) {
241             throw new IllegalStateException("Not open: " + context);
242         }
243     }
244 
245     @Override
246     public void addResourceRoot(String[] path, Resource root, Class<? extends MicroformatExtractor> extractor) {
247         if(resourceRoots == null) {
248             resourceRoots = new ArrayList<>();
249         }
250         resourceRoots.add( new ResourceRoot(path, root, extractor) );
251     }
252 
253     @Override
254     public List<ResourceRoot> getResourceRoots() {
255         List<ResourceRoot> allRoots = new ArrayList<>();
256         if(resourceRoots != null) {
257             allRoots.addAll( resourceRoots );
258         }
259         for(ExtractionResult er : subResults) {
260             ExtractionResultImpl/org/apache/any23/extractor/ExtractionResultImpl.html#ExtractionResultImpl">ExtractionResultImpl eri = (ExtractionResultImpl) er;
261             if( eri.resourceRoots != null ) {
262                 allRoots.addAll( eri.resourceRoots );
263             }
264         }
265         return allRoots;
266     }
267 
268     @Override
269     public void addPropertyPath(
270             Class<? extends MicroformatExtractor> extractor,
271             Resource propertySubject,
272             Resource property,
273             BNode object,
274             String[] path
275     ) {
276         if(propertyPaths == null) {
277             propertyPaths = new ArrayList<>();
278         }
279         propertyPaths.add( new PropertyPath(path, propertySubject, property, object, extractor) );
280     }
281 
282     @Override
283     public List<PropertyPath> getPropertyPaths() {
284         List<PropertyPath> allPaths = new ArrayList<>();
285         if(propertyPaths != null) {
286             allPaths.addAll( propertyPaths );
287         }
288         for(ExtractionResult er : subResults) {
289             ExtractionResultImpl/org/apache/any23/extractor/ExtractionResultImpl.html#ExtractionResultImpl">ExtractionResultImpl eri = (ExtractionResultImpl) er;
290             if( eri.propertyPaths != null ) {
291                 allPaths.addAll( eri.propertyPaths );
292             }
293         }
294         return allPaths;
295     }
296 
297     @Override
298     public String toString() {
299         final StringBuilder sb = new StringBuilder();
300         sb.append(context.toString());
301         sb.append('\n');
302         if (issues != null) {
303             sb.append("Errors {\n");
304             for (Issue issue : issues) {
305                 sb.append('\t');
306                 sb.append(issue.toString());
307                 sb.append('\n');
308             }
309         }
310         sb.append("}\n");
311         return sb.toString();
312     }
313 
314 }