View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor;
19  
20  import org.apache.any23.extractor.html.MicroformatExtractor;
21  import org.apache.any23.rdf.Prefixes;
22  import org.apache.any23.writer.TripleHandler;
23  import org.apache.any23.writer.TripleHandlerException;
24  import org.openrdf.model.BNode;
25  import org.openrdf.model.Resource;
26  import org.openrdf.model.URI;
27  import org.openrdf.model.Value;
28  
29  import java.io.PrintStream;
30  import java.util.ArrayList;
31  import java.util.Collection;
32  import java.util.Collections;
33  import java.util.HashSet;
34  import java.util.List;
35  import java.util.Set;
36  
37  /**
38   * <p/>
39   * A default implementation of {@link ExtractionResult}; it receives
40   * extraction output from one {@link Extractor} working on one document,
41   * and passes the output on to a {@link TripleHandler}. It deals with
42   * details such as creation of {@link ExtractionContext} objects
43   * and closing any open contexts at the end of extraction.
44   * <p/>
45   * The {@link #close()} method must be invoked after the extractor has
46   * finished processing.
47   * <p/>
48   * There is usually no need to provide additional implementations
49   * of the ExtractionWriter interface.
50   * <p/>
51   *
52   * @see org.apache.any23.writer.TripleHandler
53   * @see ExtractionContext
54   * @author Richard Cyganiak (richard@cyganiak.de)
55   * @author Michele Mostarda (michele.mostarda@gmail.com)
56   */
57  public class ExtractionResultImpl implements TagSoupExtractionResult {
58  
59      private final ExtractionContext context;
60  
61      private final Extractor<?> extractor;
62  
63      private final TripleHandler tripleHandler;
64  
65      private final Collection<ExtractionResult> subResults = new ArrayList<ExtractionResult>();
66  
67      private final Set<Object> knownContextIDs = new HashSet<Object>();
68  
69      private boolean isClosed = false;
70  
71      private boolean isInitialized = false;
72  
73      private List<Issue> issues;
74  
75      private List<ResourceRoot> resourceRoots;
76  
77      private List<PropertyPath> propertyPaths;
78  
79      public ExtractionResultImpl(
80              ExtractionContext context,
81              Extractor<?> extractor,
82              TripleHandler tripleHandler
83      ) {
84          this(context, extractor, tripleHandler, new ArrayList<Issue>());
85      }
86  
87      private ExtractionResultImpl(
88              ExtractionContext context,
89              Extractor<?> extractor,
90              TripleHandler tripleHandler,
91              List<Issue> issues
92      ) {
93          if(context == null) {
94              throw new NullPointerException("context cannot be null.");
95          }
96          if(extractor == null) {
97              throw new NullPointerException("extractor cannot be null.");
98          }
99          if(tripleHandler == null) {
100             throw new NullPointerException("triple handler cannot be null.");
101         }
102 
103         this.extractor       = extractor;
104         this.tripleHandler   = tripleHandler;
105         this.context         = context;
106         this.issues          = issues;
107 
108         knownContextIDs.add( context.getUniqueID() );
109     }
110 
111     public boolean hasIssues() {
112         return ! issues.isEmpty();
113     }
114 
115     public int getIssuesCount() {
116         return issues.size();
117     }
118 
119     public void printReport(PrintStream ps) {
120         ps.print(String.format("Context: %s [errors: %d] {\n", context, getIssuesCount()));
121         for (Issue issue : issues) {
122             ps.print(issue.toString());
123             ps.print("\n");
124         }
125         // Printing sub results.
126         for (ExtractionResult er : subResults) {
127             er.printReport(ps);
128         }
129         ps.print("}\n");
130     }
131 
132     public Collection<Issue> getIssues() {
133         return issues.isEmpty() ? Collections.<Issue>emptyList() : Collections.unmodifiableList(issues);
134     }
135 
136     public ExtractionResult openSubResult(ExtractionContext context) {
137         final String contextID = context.getUniqueID();
138         if (knownContextIDs.contains(contextID)) {
139             throw new IllegalArgumentException("Duplicate contextID: " + contextID);
140         }
141         knownContextIDs.add(contextID);
142 
143         checkOpen();
144         ExtractionResult result = new ExtractionResultImpl(context, extractor, tripleHandler, this.issues);
145         subResults.add(result);
146         return result;
147     }
148 
149     public ExtractionContext getExtractionContext() {
150         return context;
151     }
152 
153     public void writeTriple(Resource s, URI p, Value o, URI g) {
154         if (s == null || p == null || o == null) return;
155         // Check for misconstructed literals or BNodes, Sesame does not catch this.
156         if (s.stringValue() == null || p.stringValue() == null || o.stringValue() == null) {
157             throw new IllegalArgumentException("The statement arguments must be not null.");
158         }
159         checkOpen();
160         try {
161             tripleHandler.receiveTriple(s, p, o, g, context);
162         } catch (TripleHandlerException e) {
163             throw new RuntimeException(
164                     String.format("Error while receiving triple %s %s %s", s, p, o ),
165                     e
166             );
167         }
168     }
169 
170     public void writeTriple(Resource s, URI p, Value o) {
171         writeTriple(s, p, o, null);
172     }
173 
174     public void writeNamespace(String prefix, String uri) {
175         checkOpen();
176         try {
177             tripleHandler.receiveNamespace(prefix, uri, context);
178         } catch (TripleHandlerException e) {
179             throw new RuntimeException(
180                     String.format("Error while writing namespace %s:%s", prefix, uri),
181                     e
182             );
183         }
184     }
185 
186     public void notifyIssue(IssueLevel level, String msg, int row, int col) {
187         issues.add(new Issue(level, msg, row, col));
188     }
189 
190     public void close() {
191         if (isClosed) return;
192         isClosed = true;
193         for (ExtractionResult subResult : subResults) {
194             subResult.close();
195         }
196         if (isInitialized) {
197             try {
198                 tripleHandler.closeContext(context);
199             } catch (TripleHandlerException e) {
200                 throw new RuntimeException("Error while opening context", e);
201             }
202         }
203     }
204 
205     private void checkOpen() {
206         if (!isInitialized) {
207             isInitialized = true;
208             try {
209                 tripleHandler.openContext(context);
210             } catch (TripleHandlerException e) {
211                 throw new RuntimeException("Error while opening context", e);
212             }
213             Prefixes prefixes = extractor.getDescription().getPrefixes();
214             for (String prefix : prefixes.allPrefixes()) {
215                 try {
216                     tripleHandler.receiveNamespace(prefix, prefixes.getNamespaceURIFor(prefix), context);
217                 } catch (TripleHandlerException e) {
218                     throw new RuntimeException(String.format("Error while writing namespace %s", prefix),
219                             e
220                     );
221                 }
222             }
223         }
224         if (isClosed) {
225             throw new IllegalStateException("Not open: " + context);
226         }
227     }
228 
229     public void addResourceRoot(String[] path, Resource root, Class<? extends MicroformatExtractor> extractor) {
230         if(resourceRoots == null) {
231             resourceRoots = new ArrayList<ResourceRoot>();
232         }
233         resourceRoots.add( new ResourceRoot(path, root, extractor) );
234     }
235 
236     public List<ResourceRoot> getResourceRoots() {
237         List<ResourceRoot> allRoots = new ArrayList<ResourceRoot>();
238         if(resourceRoots != null) {
239             allRoots.addAll( resourceRoots );
240         }
241         for(ExtractionResult er : subResults) {
242             ExtractionResultImpl eri = (ExtractionResultImpl) er;
243             if( eri.resourceRoots != null ) {
244                 allRoots.addAll( eri.resourceRoots );
245             }
246         }
247         return allRoots;
248     }
249 
250     public void addPropertyPath(
251             Class<? extends MicroformatExtractor> extractor,
252             Resource propertySubject,
253             Resource property,
254             BNode object,
255             String[] path
256     ) {
257         if(propertyPaths == null) {
258             propertyPaths = new ArrayList<PropertyPath>();
259         }
260         propertyPaths.add( new PropertyPath(path, propertySubject, property, object, extractor) );
261     }
262 
263     public List<PropertyPath> getPropertyPaths() {
264         List<PropertyPath> allPaths = new ArrayList<PropertyPath>();
265         if(propertyPaths != null) {
266             allPaths.addAll( propertyPaths );
267         }
268         for(ExtractionResult er : subResults) {
269             ExtractionResultImpl eri = (ExtractionResultImpl) er;
270             if( eri.propertyPaths != null ) {
271                 allPaths.addAll( eri.propertyPaths );
272             }
273         }
274         return allPaths;
275     }
276 
277     @Override
278     public String toString() {
279         final StringBuilder sb = new StringBuilder();
280         sb.append(context.toString());
281         sb.append('\n');
282         if (issues != null) {
283             sb.append("Errors {\n");
284             for (Issue issue : issues) {
285                 sb.append('\t');
286                 sb.append(issue.toString());
287                 sb.append('\n');
288             }
289         }
290         sb.append("}\n");
291         return sb.toString();
292     }
293 
294 }