View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor;
19  
20  import org.apache.any23.configuration.Configuration;
21  import org.apache.any23.configuration.DefaultConfiguration;
22  import org.apache.any23.encoding.EncodingDetector;
23  import org.apache.any23.encoding.TikaEncodingDetector;
24  import org.apache.any23.extractor.html.DocumentReport;
25  import org.apache.any23.extractor.html.HTMLDocument;
26  import org.apache.any23.extractor.html.MicroformatExtractor;
27  import org.apache.any23.extractor.html.TagSoupParser;
28  import org.apache.any23.mime.MIMEType;
29  import org.apache.any23.mime.MIMETypeDetector;
30  import org.apache.any23.rdf.Any23ValueFactoryWrapper;
31  import org.apache.any23.rdf.RDFUtils;
32  import org.apache.any23.source.DocumentSource;
33  import org.apache.any23.source.LocalCopyFactory;
34  import org.apache.any23.source.MemCopyFactory;
35  import org.apache.any23.validator.EmptyValidationReport;
36  import org.apache.any23.validator.ValidatorException;
37  import org.apache.any23.vocab.SINDICE;
38  import org.apache.any23.writer.CompositeTripleHandler;
39  import org.apache.any23.writer.CountingTripleHandler;
40  import org.apache.any23.writer.TripleHandler;
41  import org.apache.any23.writer.TripleHandlerException;
42  import org.apache.any23.extractor.Extractor.BlindExtractor;
43  import org.apache.any23.extractor.Extractor.ContentExtractor;
44  import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
45  import org.openrdf.model.BNode;
46  import org.openrdf.model.URI;
47  import org.openrdf.model.impl.URIImpl;
48  import org.openrdf.model.impl.ValueFactoryImpl;
49  import org.slf4j.Logger;
50  import org.slf4j.LoggerFactory;
51  
52  import java.io.BufferedInputStream;
53  import java.io.ByteArrayOutputStream;
54  import java.io.IOException;
55  import java.io.InputStream;
56  import java.io.PrintStream;
57  import java.net.URISyntaxException;
58  import java.util.ArrayList;
59  import java.util.Collection;
60  import java.util.Collections;
61  import java.util.Date;
62  import java.util.HashMap;
63  import java.util.List;
64  import java.util.Map;
65  import java.util.UUID;
66  
67  import static org.apache.any23.extractor.TagSoupExtractionResult.PropertyPath;
68  import static org.apache.any23.extractor.TagSoupExtractionResult.ResourceRoot;
69  
/**
 * This class acts as a facade through which all the registered extractors
 * are invoked on a single document.
 */
73  public class SingleDocumentExtraction {
74  
    // Shared SINDICE vocabulary, used to emit document-domain metadata triples.
    private static final SINDICE vSINDICE = SINDICE.getInstance();

    private final static Logger log = LoggerFactory.getLogger(SingleDocumentExtraction.class);

    // Configuration applied during every extraction run.
    private final Configuration configuration;

    // Original document source (possibly remote).
    private final DocumentSource in;

    // URI of the document being processed; assigned at the beginning of run(...).
    private URI documentURI;
    
    // Extractors registered for this extraction.
    private final ExtractorGroup extractors;

    // Composite handler: the caller-provided handler plus a CountingTripleHandler.
    private final TripleHandler output;

    // Detector used to lazily guess the parser encoding (see detectEncoding()).
    private final EncodingDetector encoderDetector;

    // Factory producing a local copy of the document; defaults to MemCopyFactory.
    private LocalCopyFactory copyFactory = null;

    // Cached local copy of the input source (see ensureHasLocalCopy()).
    private DocumentSource localDocumentSource = null;

    // Optional MIME type detector; when null all extractors are activated.
    private MIMETypeDetector detector = null;

    // Extractors matching the detected MIME type; cached by filterExtractorsByMIMEType().
    private ExtractorGroup matchingExtractors = null;

    // MIME type detected for the document, if detection was performed.
    private MIMEType detectedMIMEType = null;

    // Cached tag soup DOM + validation report (see getTagSoupDOM(...)).
    private DocumentReport documentReport = null;

    // Parameters used to build the cached documentReport, for cache invalidation.
    private ExtractionParameters tagSoupDOMRelatedParameters = null;

    // Encoding used by the parser; lazily detected when not set explicitly.
    private String parserEncoding = null;
106 
107     /**
108      * Builds an extractor by the specification of document source,
109      * list of extractors and output triple handler.
110      *
111      * @param configuration configuration applied during extraction.
112      * @param in input document source.
113      * @param extractors list of extractors to be applied.
114      * @param output output triple handler.
115      */
116     public SingleDocumentExtraction(
117             Configuration configuration, DocumentSource in, ExtractorGroup extractors, TripleHandler output
118     ) {
119         if(configuration == null) throw new NullPointerException("configuration cannot be null.");
120         if(in == null)            throw new NullPointerException("in cannot be null.");
121         this.configuration = configuration;
122         this.in = in;
123         this.extractors = extractors;
124 
125         List<TripleHandler> tripleHandlers = new ArrayList<TripleHandler>();
126         tripleHandlers.add(output);
127         tripleHandlers.add(new CountingTripleHandler());
128         this.output = new CompositeTripleHandler(tripleHandlers);
129         this.encoderDetector = new TikaEncodingDetector();
130     }
131 
    /**
     * Builds an extractor by the specification of document source,
     * extractors factory and output triple handler.
     * The single provided factory is wrapped in a singleton extractor group.
     *
     * @param configuration configuration applied during extraction.
     * @param in input document source.
     * @param factory the extractors factory.
     * @param output output triple handler.
     */
    public SingleDocumentExtraction(
            Configuration configuration, DocumentSource in, ExtractorFactory<?> factory, TripleHandler output
    ) {
        this(
                configuration,
                in,
                new ExtractorGroup(Collections.<ExtractorFactory<?>>singletonList(factory)),
                output
        );
        // A null detector disables MIME type filtering: the single registered
        // extractor is always activated (see setMIMETypeDetector).
        this.setMIMETypeDetector(null);
    }
152 
    /**
     * Builds an extractor by the specification of document source,
     * extractors factory and output triple handler, using the
     * {@link org.apache.any23.configuration.DefaultConfiguration}.
     *
     * @param in input document source.
     * @param factory the extractors factory.
     * @param output output triple handler.
     */
    public SingleDocumentExtraction(
        DocumentSource in, ExtractorFactory<?> factory, TripleHandler output
    ) {
        this(
                DefaultConfiguration.singleton(),
                in,
                new ExtractorGroup(Collections.<ExtractorFactory<?>>singletonList(factory)),
                output
        );
        // A null detector disables MIME type filtering: the single registered
        // extractor is always activated (see setMIMETypeDetector).
        this.setMIMETypeDetector(null);
    }
173 
174     /**
175      * Sets the internal factory for generating the document local copy,
176      * if <code>null</code> the {@link org.apache.any23.source.MemCopyFactory} will be used.
177      *
178      * @param copyFactory local copy factory.
179      * @see org.apache.any23.source.DocumentSource
180      */
181     public void setLocalCopyFactory(LocalCopyFactory copyFactory) {
182         this.copyFactory = copyFactory;
183     }
184 
185     /**
186      * Sets the internal mime type detector,
187      * if <code>null</code> mimetype detection will
188      * be skipped and all extractors will be activated.
189      *
190      * @param detector detector instance.
191      */
192     public void setMIMETypeDetector(MIMETypeDetector detector) {
193         this.detector = detector;
194     }
195 
196     /**
197      * Triggers the execution of all the {@link Extractor}
198      * registered to this class using the specified extraction parameters.
199      *
200      * @param extractionParameters the parameters applied to the run execution.
201      * @return the report generated by the extraction.
202      * @throws ExtractionException if an error occurred during the data extraction.
203      * @throws IOException if an error occurred during the data access.
204      */
205     public SingleDocumentExtractionReport run(ExtractionParameters extractionParameters)
206     throws ExtractionException, IOException {
207         if(extractionParameters == null) {
208             extractionParameters = ExtractionParameters.newDefault(configuration);
209         }
210 
211         final String contextURI = extractionParameters.getProperty(ExtractionParameters.EXTRACTION_CONTEXT_URI_PROPERTY);
212         ensureHasLocalCopy();
213         try {
214             this.documentURI = new Any23ValueFactoryWrapper(
215                     ValueFactoryImpl.getInstance()
216             ).createURI( "?".equals(contextURI) ? in.getDocumentURI() : contextURI);
217         } catch (Exception ex) {
218             throw new IllegalArgumentException("Invalid URI: " + in.getDocumentURI(), ex);
219         }
220         if(log.isInfoEnabled()) {
221             log.info("Processing " + this.documentURI);
222         }
223         filterExtractorsByMIMEType();
224 
225         if(log.isDebugEnabled()) {
226             StringBuffer sb = new StringBuffer("Extractors ");
227             for (ExtractorFactory<?> factory : matchingExtractors) {
228                 sb.append(factory.getExtractorName());
229                 sb.append(' ');
230             }
231             sb.append("match ").append(documentURI);
232             log.debug(sb.toString());
233         }
234 
235         // Invoke all extractors.
236         try {
237             output.startDocument(documentURI);
238         } catch (TripleHandlerException e) {
239             log.error(String.format("Error starting document with URI %s", documentURI));
240             throw new ExtractionException(String.format("Error starting document with URI %s", documentURI),
241                     e
242             );
243         }
244         output.setContentLength(in.getContentLength());
245         // Create the document context.
246         final List<ResourceRoot> resourceRoots = new ArrayList<ResourceRoot>();
247         final List<PropertyPath> propertyPaths = new ArrayList<PropertyPath>();
248         final Map<String,Collection<IssueReport.Issue>> extractorToIssues =
249             new HashMap<String,Collection<IssueReport.Issue>>();
250         try {
251             final String documentLanguage = extractDocumentLanguage(extractionParameters);
252             for (ExtractorFactory<?> factory : matchingExtractors) {
253                 final Extractor extractor = factory.createExtractor();
254                 final SingleExtractionReport er = runExtractor(
255                         extractionParameters,
256                         documentLanguage,
257                         extractor
258                 );
259                 resourceRoots.addAll( er.resourceRoots );
260                 propertyPaths.addAll( er.propertyPaths );
261                 extractorToIssues.put(factory.getExtractorName(), er.issues);
262             }
263         } catch(ValidatorException ve) {
264             throw new ExtractionException("An error occurred during the validation phase.", ve);
265         }
266 
267         // Resource consolidation.
268         final boolean addDomainTriples = extractionParameters.getFlag(ExtractionParameters.METADATA_DOMAIN_PER_ENTITY_FLAG);
269         final ExtractionContext consolidationContext;
270         if(extractionParameters.getFlag(ExtractionParameters.METADATA_NESTING_FLAG)) {
271             // Consolidation with nesting.
272             consolidationContext = consolidateResources(resourceRoots, propertyPaths, addDomainTriples, output);
273         } else {
274             consolidationContext = consolidateResources(resourceRoots, addDomainTriples, output);
275         }
276 
277         // Adding time/size meta triples.
278         if (extractionParameters.getFlag(ExtractionParameters.METADATA_TIMESIZE_FLAG)) {
279             try {
280                 addExtractionTimeSizeMetaTriples(consolidationContext);
281             } catch (TripleHandlerException e) {
282                 throw new ExtractionException(
283                         String.format(
284                                 "Error while adding extraction metadata triples document with URI %s", documentURI
285                         ),
286                         e
287                 );
288             }
289         }
290 
291         try {
292             output.endDocument(documentURI);
293         } catch (TripleHandlerException e) {
294             log.error(String.format("Error ending document with URI %s", documentURI));
295             throw new ExtractionException(String.format("Error ending document with URI %s", documentURI),
296                     e
297             );
298         }
299 
300         return new SingleDocumentExtractionReport(
301                 documentReport == null
302                         ?
303                 EmptyValidationReport.getInstance() : documentReport.getReport(),
304                 extractorToIssues
305         );
306     }
307 
308     /**
309      * Triggers the execution of all the {@link Extractor}
310      * registered to this class using the <i>default</i> extraction parameters.
311      *
312      * @throws IOException
313      * @throws ExtractionException
314      * @return the extraction report.
315      */
316     public SingleDocumentExtractionReport run() throws IOException, ExtractionException {
317         return run(ExtractionParameters.newDefault(configuration));
318     }
319 
    /**
     * Returns the detected mimetype for the given {@link org.apache.any23.source.DocumentSource}.
     * <p>
     * NOTE(review): detection delegates to {@link #filterExtractorsByMIMEType()}, which
     * reads {@code documentURI}; that field is only assigned inside run(...), so calling
     * this with a detector configured before run() would presumably NPE — confirm intended
     * call order.
     *
     * @return string containing the detected mimetype, or <code>null</code> when
     *         detection was skipped (no detector configured).
     * @throws IOException if an error occurred while accessing the data.
     */
    public String getDetectedMIMEType() throws IOException {
        filterExtractorsByMIMEType();
        return  detectedMIMEType == null ? null : detectedMIMEType.toString();
    }
330 
331     /**
332      * Check whether the given {@link org.apache.any23.source.DocumentSource} content activates of not at least an extractor.
333      *
334      * @return <code>true</code> if at least an extractor is activated, <code>false</code> otherwise.
335      * @throws IOException
336      */
337     public boolean hasMatchingExtractors() throws IOException {
338         filterExtractorsByMIMEType();
339         return !matchingExtractors.isEmpty();
340     }
341 
342     /**
343      * @return the list of all the activated extractors for the given {@link org.apache.any23.source.DocumentSource}.
344      */
345     public List<Extractor> getMatchingExtractors() {
346         final List<Extractor> extractorsList = new ArrayList<Extractor>();
347         for(ExtractorFactory extractorFactory : matchingExtractors) {
348             extractorsList.add( extractorFactory.createExtractor() );
349         }
350         return extractorsList;
351     }
352 
353     /**
354      * @return the configured parsing encoding.
355      */
356     public String getParserEncoding() {
357         if(this.parserEncoding == null) {
358             this.parserEncoding = detectEncoding();
359         }
360         return this.parserEncoding;
361     }
362 
363     /**
364      * Sets the document parser encoding.
365      *
366      * @param encoding parser encoding.
367      */
368     public void setParserEncoding(String encoding) {
369         this.parserEncoding = encoding;
370         documentReport = null;
371     }
372 
373     /**
374      * Chech whether the given {@link org.apache.any23.source.DocumentSource} is an <b>HTML</b> document.
375      *
376      * @return <code>true</code> if the document source is an HTML document.
377      * @throws IOException if an error occurs while accessing data.
378      */
379     private boolean isHTMLDocument() throws IOException {
380         filterExtractorsByMIMEType();
381         return ! matchingExtractors.filterByMIMEType( MIMEType.parse("text/html") ).isEmpty();
382     }
383 
384     /**
385      * Extracts the document language where possible.
386      *
387      * @param extractionParameters extraction parameters to be applied to determine the document language.
388      * @return the document language if any, <code>null</code> otherwise.
389      * @throws java.io.IOException if an error occurs during the document analysis.
390      * @throws org.apache.any23.validator.ValidatorException
391      */
392     private String extractDocumentLanguage(ExtractionParameters extractionParameters)
393     throws IOException, ValidatorException {
394         if( ! isHTMLDocument() ) {
395             return null;
396         }
397         final HTMLDocument document;
398         try {
399             document = new HTMLDocument( getTagSoupDOM(extractionParameters).getDocument() );
400         } catch (IOException ioe) {
401             log.debug("Cannot extract language from document.", ioe);
402             return null;
403         }
404         return document.getDefaultLanguage();
405     }
406 
    /**
     * Generates a list of extractors that can be applied to the given document.
     * Idempotent: the result is cached in {@code matchingExtractors} and the
     * detected type (when detection runs) in {@code detectedMIMEType}.
     *
     * @throws IOException if an error occurs while accessing the local document copy.
     */
    private void filterExtractorsByMIMEType()
    throws IOException {
        if (matchingExtractors != null) return;  // has already been run.

        // Without a detector — or when every extractor accepts any content type —
        // there is nothing to filter: activate all registered extractors.
        if (detector == null || extractors.allExtractorsSupportAllContentTypes()) {
            matchingExtractors = extractors;
            return;
        }
        ensureHasLocalCopy();
        // NOTE(review): documentURI is only assigned inside run(...); reaching this
        // point before run() would presumably NPE on stringValue() — confirm callers.
        detectedMIMEType = detector.guessMIMEType(
                java.net.URI.create(documentURI.stringValue()).getPath(),
                localDocumentSource.openInputStream(),
                MIMEType.parse(localDocumentSource.getContentType())
        );
        log.debug("detected media type: " + detectedMIMEType);
        matchingExtractors = extractors.filterByMIMEType(detectedMIMEType);
    }
429 
    /**
     * Triggers the execution of a specific {@link Extractor}.
     * 
     * @param extractionParameters the parameters used for the extraction.
     * @param documentLanguage the detected document language, possibly <code>null</code>.
     * @param extractor the {@link Extractor} to be executed.
     * @throws ExtractionException if an error specific to an extractor happens.
     * @throws IOException if an IO error occurs during the extraction.
     * @return the roots of the resources that have been extracted.
     * @throws org.apache.any23.validator.ValidatorException if an error occurs during validation.
     */
    private SingleExtractionReport runExtractor(
            final ExtractionParameters extractionParameters,
            final String documentLanguage,
            final Extractor<?> extractor
    ) throws ExtractionException, IOException, ValidatorException {
        if(log.isDebugEnabled()) {
            log.debug("Running " + extractor.getDescription().getExtractorName() + " on " + documentURI);
        }
        long startTime = System.currentTimeMillis();
        final ExtractionContext extractionContext = new ExtractionContext(
                extractor.getDescription().getExtractorName(),
                documentURI,
                documentLanguage
        );
        final ExtractionResultImpl extractionResult = new ExtractionResultImpl(extractionContext, extractor, output);
        try {
            // Dispatch on the concrete extractor subtype: each one consumes a
            // different representation of the document.
            if (extractor instanceof BlindExtractor) {
                // BlindExtractor: works from the document URI alone, no content access.
                final BlindExtractor blindExtractor = (BlindExtractor) extractor;
                blindExtractor.run(extractionParameters, extractionContext, documentURI, extractionResult);
            } else if (extractor instanceof ContentExtractor) {
                // ContentExtractor: consumes the raw document input stream.
                ensureHasLocalCopy();
                final ContentExtractor contentExtractor = (ContentExtractor) extractor;
                contentExtractor.run(
                        extractionParameters,
                        extractionContext,
                        localDocumentSource.openInputStream(),
                        extractionResult
                );
            } else if (extractor instanceof TagSoupDOMExtractor) {
                // TagSoupDOMExtractor: consumes the (cached) tag soup DOM.
                final TagSoupDOMExtractor tagSoupDOMExtractor = (TagSoupDOMExtractor) extractor;
                final DocumentReport documentReport = getTagSoupDOM(extractionParameters);
                tagSoupDOMExtractor.run(
                        extractionParameters,
                        extractionContext,
                        documentReport.getDocument(),
                        extractionResult
                );
            } else {
                throw new IllegalStateException("Extractor type not supported: " + extractor.getClass());
            }
            // Snapshot the collected roots/paths: extractionResult is closed in the
            // finally block below, so its internal lists are copied defensively here.
            return
                new SingleExtractionReport(
                    extractionResult.getIssues(),
                    new ArrayList<ResourceRoot>( extractionResult.getResourceRoots() ),
                    new ArrayList<PropertyPath>( extractionResult.getPropertyPaths() )
                );
        } catch (ExtractionException ex) {
            // Log the failing extractor's name for diagnosis, then propagate.
            if(log.isDebugEnabled()) {
                log.debug(extractor.getDescription().getExtractorName() + ": " + ex.getMessage());
            }
            throw ex;
        } finally {
            // Logging result error report.
            if(log.isDebugEnabled() && extractionResult.hasIssues() ) {
                ByteArrayOutputStream baos = new ByteArrayOutputStream();
                extractionResult.printReport(new PrintStream(baos));
                log.debug(baos.toString());
            }
            // Close the result in every case, success or failure.
            extractionResult.close();

            long elapsed = System.currentTimeMillis() - startTime;
            if(log.isDebugEnabled()) {
                log.debug("Completed " + extractor.getDescription().getExtractorName() + ", " + elapsed + "ms");
            }
        }
    }
506 
507     /**
508      * Forces the retrieval of the document data.
509      *
510      * @throws IOException
511      */
512     private void ensureHasLocalCopy() throws IOException {
513         if (localDocumentSource != null) return;
514         if (in.isLocal()) {
515             localDocumentSource = in;
516             return;
517         }
518         if (copyFactory == null) {
519             copyFactory = new MemCopyFactory();
520         }
521         localDocumentSource = copyFactory.createLocalCopy(in);
522     }
523 
    /**
     * Returns the DOM of the given document source (that must be an HTML stream)
     * and the report of eventual fixes applied on it.
     * The result is cached and rebuilt only when the extraction parameters change.
     *
     * @param extractionParameters parameters to be used during extraction.
     * @return document report.
     * @throws IOException if an error occurs during data access.
     * @throws ValidatorException if an error occurs during validation.
     */
    private DocumentReport getTagSoupDOM(ExtractionParameters extractionParameters)
    throws IOException, ValidatorException {
        if (documentReport == null || !extractionParameters.equals(tagSoupDOMRelatedParameters)) {
            ensureHasLocalCopy();
            final InputStream is = new BufferedInputStream( localDocumentSource.openInputStream() );
            // NOTE(review): the mark/reset pair brackets getParserEncoding(), but encoding
            // detection opens its own stream (see detectEncoding()), so this stream appears
            // untouched in between — presumably defensive; confirm before removing.
            is.mark(Integer.MAX_VALUE);
            final String candidateEncoding = getParserEncoding();
            is.reset();
            final TagSoupParser tagSoupParser = new TagSoupParser(
                    is,
                    documentURI.stringValue(),
                    candidateEncoding
            );
            if(extractionParameters.isValidate()) {
                // Validated DOM, applying fixes when requested by the parameters.
                documentReport = tagSoupParser.getValidatedDOM( extractionParameters.isFix() );
            } else {
                // No validation requested: empty report plus the raw tag soup DOM.
                documentReport = new DocumentReport( EmptyValidationReport.getInstance(), tagSoupParser.getDOM() );
            }
            // Remember which parameters produced the cached report.
            tagSoupDOMRelatedParameters = extractionParameters;
        }
        return documentReport;
    }
555 
556     /**
557      * Detects the encoding of the local document source input stream.
558      * 
559      * @return a valid encoding value.
560      */
561     private String detectEncoding() {
562         try {
563             ensureHasLocalCopy();
564             InputStream is = new BufferedInputStream(localDocumentSource.openInputStream());
565             String encoding = this.encoderDetector.guessEncoding(is);
566             is.close();
567             return encoding;
568         } catch (Exception e) {
569             throw new RuntimeException("An error occurred while trying to detect the input encoding.", e);
570         }
571     }
572 
573     /**
574      * This function verifies if the <i>candidateSub</i> list of strings
575      * is a prefix of <i>list</i>.
576      *
577      * @param list a list of strings.
578      * @param candidateSub a list of strings.
579      * @return <code>true</code> if <i>candidateSub</i> is a sub path of <i>list</i>,
580      *         <code>false</code> otherwise.
581      */
582     private boolean subPath(String[] list, String[] candidateSub) {
583         if(candidateSub.length > list.length) {
584             return false;
585         }
586         for(int i = 0; i < candidateSub.length; i++) {
587             if( ! candidateSub[i].equals(list[i])) {
588                 return false;
589             }
590         }
591         return true;
592     }
593 
594     /**
595      * Adds for every resource root node a page domain triple.
596      *
597      * @param resourceRoots list of resource roots.
598      * @param context extraction context to produce triples.
599      * @throws ExtractionException
600      */
601     private void addDomainTriplesPerResourceRoots(List<ResourceRoot> resourceRoots, ExtractionContext context)
602     throws ExtractionException {
603         try {
604             // Add source Web domains to every resource root.
605             String domain;
606             try {
607                 domain = new java.net.URI(in.getDocumentURI()).getHost();
608             } catch (URISyntaxException urise) {
609                 throw new IllegalArgumentException(
610                         "An error occurred while extracting the host from the document URI.",
611                         urise
612                 );
613             }
614             if (domain != null) {
615                 for (ResourceRoot resourceRoot : resourceRoots) {
616                     output.receiveTriple(
617                             resourceRoot.getRoot(),
618                             vSINDICE.getProperty(SINDICE.DOMAIN),
619                             ValueFactoryImpl.getInstance().createLiteral(domain),
620                             null,
621                             context
622                     );
623                 }
624             }
625         } catch (TripleHandlerException e) {
626             throw new ExtractionException("Error while writing triple triple.", e);
627         } finally {
628             try {
629                 output.closeContext(context);
630             } catch (TripleHandlerException e) {
631                 throw new ExtractionException("Error while closing context.", e);
632             }
633         }
634     }
635 
636     /**
637      * @return an extraction context specific for consolidation triples.
638      */
639     private ExtractionContext createExtractionContext() {
640         return new ExtractionContext(
641                 "consolidation-extractor",
642                 documentURI,
643                 UUID.randomUUID().toString()
644         );
645     }
646 
647     /**
648      * Detect the nesting relationship among different
649      * Microformats and explicit them adding connection triples.
650      *
651      * @param resourceRoots
652      * @param propertyPaths
653      * @param context
654      * @throws TripleHandlerException
655      */
656     private void addNestingRelationship(
657             List<ResourceRoot> resourceRoots,
658             List<PropertyPath> propertyPaths,
659             ExtractionContext context
660     ) throws TripleHandlerException {
661         ResourceRoot currentResourceRoot;
662         PropertyPath currentPropertyPath;
663         for (int r = 0; r < resourceRoots.size(); r++) {
664             currentResourceRoot = resourceRoots.get(r);
665             for (int p = 0; p < propertyPaths.size(); p++) {
666                 currentPropertyPath = propertyPaths.get(p);
667                 Class<? extends MicroformatExtractor> currentResourceRootExtractor = currentResourceRoot.getExtractor();
668                 Class<? extends MicroformatExtractor> currentPropertyPathExtractor = currentPropertyPath.getExtractor();
669                 // Avoid wrong nesting relationships.
670                 if (currentResourceRootExtractor.equals(currentPropertyPathExtractor)) {
671                     continue;
672                 }
673                 // Avoid self declaring relationships
674                 if(MicroformatExtractor.includes(currentPropertyPathExtractor, currentResourceRootExtractor)) {
675                     continue;
676                 }
677                 if (subPath(currentResourceRoot.getPath(), currentPropertyPath.getPath())) {
678                     createNestingRelationship(currentPropertyPath, currentResourceRoot, output, context);
679                 }
680             }
681         }
682     }
683 
684     /**
685      * This method consolidates the graphs extracted from the same document.
686      * In particular it adds:
687      * <ul>
688      *   <li>for every microformat root node a triple indicating the original Web page domain;</li>
689      *   <li>triples indicating the nesting relationship among a microformat root and property paths of
690      *       other nested microformats.
691      *   </li>
692      * </ul>
693      * @param resourceRoots list of RDF nodes representing roots of
694      *        extracted microformat graphs and the corresponding HTML paths.
695      * @param propertyPaths list of RDF nodes representing property subjects, property URIs and the HTML paths
696      *        from which such properties have been extracted. 
697      * @param addDomainTriples
698      * @param output a triple handler event collector.
699      * @return
700      * @throws ExtractionException
701      */
702     private ExtractionContext consolidateResources(
703             List<ResourceRoot> resourceRoots,
704             List<PropertyPath> propertyPaths,
705             boolean addDomainTriples,
706             TripleHandler output
707     ) throws ExtractionException {
708         final ExtractionContext context = createExtractionContext();
709 
710         try {
711             output.openContext(context);
712         } catch (TripleHandlerException e) {
713             throw new ExtractionException(
714                     String.format("Error starting document with URI %s", documentURI),
715                     e
716             );
717         }
718 
719         try {
720             if(addDomainTriples) {
721                 addDomainTriplesPerResourceRoots(resourceRoots, context);
722             }
723             addNestingRelationship(resourceRoots, propertyPaths, context);
724         } catch (TripleHandlerException the) {
725             throw new ExtractionException("Error while writing triple triple.", the);
726         } finally {
727             try {
728                 output.closeContext(context);
729             } catch (TripleHandlerException e) {
730                 throw new ExtractionException("Error while closing context.", e);
731             }
732         }
733 
734         return context;
735     }
736 
737     /**
738      * This method consolidates the graphs extracted from the same document.
739      * In particular it adds:
740      * <ul>
741      *   <li>for every microformat root node a triple indicating the original Web page domain;</li>
742      * </ul>
743      * @param resourceRoots list of RDF nodes representing roots of
744      *        extracted microformat graphs and the corresponding HTML paths.
745      *        from which such properties have been extracted.
746      * @param addDomainTriples
747      * @param output a triple handler event collector.
748      * @return
749      * @throws ExtractionException
750      */
751     private ExtractionContext consolidateResources(
752             List<ResourceRoot> resourceRoots,
753             boolean addDomainTriples,
754             TripleHandler output
755     ) throws ExtractionException {
756         final ExtractionContext context = createExtractionContext();
757 
758         try {
759             output.openContext(context);
760         } catch (TripleHandlerException e) {
761             throw new ExtractionException(
762                     String.format("Error starting document with URI %s", documentURI),
763                     e
764             );
765         }
766 
767         try {
768             if(addDomainTriples) {
769                 addDomainTriplesPerResourceRoots(resourceRoots, context);
770             }
771         } finally {
772             try {
773                 output.closeContext(context);
774             } catch (TripleHandlerException the) {
775                 throw new ExtractionException("Error while closing context.", the);
776             }
777         }
778 
779         return context;
780     }
781 
782     /**
783      * Adds metadata triples containing the number of extracted triples
784      * and the extraction timestamp.
785      *
786      * @param context
787      * @throws TripleHandlerException
788      */
789     private void addExtractionTimeSizeMetaTriples(ExtractionContext context)
790     throws TripleHandlerException {
791         // adding extraction date
792         String xsdDateTimeNow = RDFUtils.toXSDDateTime(new Date());
793         output.receiveTriple(
794                 new URIImpl(documentURI.toString()),
795                 vSINDICE.getProperty(SINDICE.DATE),
796                 ValueFactoryImpl.getInstance().createLiteral(xsdDateTimeNow),
797                 null,
798                 context
799         );
800 
801         // adding number of extracted triples
802         int numberOfTriples = 0;
803         CompositeTripleHandler cth = (CompositeTripleHandler) output;
804         for (TripleHandler th : cth.getChilds()) {
805             if (th instanceof CountingTripleHandler) {
806                 numberOfTriples = ((CountingTripleHandler) th).getCount();
807             }
808         }
809         output.receiveTriple(
810                 new URIImpl(documentURI.toString()),
811                 vSINDICE.getProperty(SINDICE.SIZE),
812                 ValueFactoryImpl.getInstance().createLiteral(numberOfTriples + 1), // the number of triples plus itself
813                 null,
814                 context
815         );
816     }
817 
818     /**
819      * Creates a nesting relationship triple.
820      * 
821      * @param from the property containing the nested microformat.
822      * @param to the root to the nested microformat.
823      * @param th the triple handler.
824      * @param ec the extraction context used to add such information.
825      * @throws org.apache.any23.writer.TripleHandlerException
826      */
827     private void createNestingRelationship(
828             PropertyPath from,
829             ResourceRoot to,
830             TripleHandler th,
831             ExtractionContext ec
832     ) throws TripleHandlerException {
833         final BNode fromObject = from.getObject();
834         final String bNodeHash = from.getProperty().stringValue() + ( fromObject == null ? "" : fromObject.getID() );
835         BNode bnode = RDFUtils.getBNode(bNodeHash);
836         th.receiveTriple(bnode, vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), from.getProperty(), null, ec );
837         th.receiveTriple(
838                 bnode,
839                 vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED),
840                 from.getObject() == null ? to.getRoot() : from.getObject(),
841                 null,
842                 ec
843         );
844         th.receiveTriple(
845                 from.getSubject(),
846                 vSINDICE.getProperty(SINDICE.NESTING),
847                 bnode,
848                 null,
849                 ec
850         );
851     }
852 
853     /**
854      * Entity detection report.
855      */
856     private class SingleExtractionReport {
857         private final Collection<IssueReport.Issue> issues;
858         private final List<ResourceRoot>            resourceRoots;
859         private final List<PropertyPath>            propertyPaths;
860 
861         public SingleExtractionReport(
862                 Collection<IssueReport.Issue>  issues,
863                 List<ResourceRoot> resourceRoots,
864                 List<PropertyPath> propertyPaths
865         ) {
866             this.issues        = issues;
867             this.resourceRoots = resourceRoots;
868             this.propertyPaths = propertyPaths;
869         }
870     }
871 
872 }