View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor;
19  
20  import org.apache.any23.configuration.Configuration;
21  import org.apache.any23.configuration.DefaultConfiguration;
22  import org.apache.any23.encoding.EncodingDetector;
23  import org.apache.any23.encoding.TikaEncodingDetector;
24  import org.apache.any23.extractor.html.DocumentReport;
25  import org.apache.any23.extractor.html.HTMLDocument;
26  import org.apache.any23.extractor.html.MicroformatExtractor;
27  import org.apache.any23.extractor.html.TagSoupParser;
28  import org.apache.any23.mime.MIMEType;
29  import org.apache.any23.mime.MIMETypeDetector;
30  import org.apache.any23.rdf.Any23ValueFactoryWrapper;
31  import org.apache.any23.rdf.RDFUtils;
32  import org.apache.any23.source.DocumentSource;
33  import org.apache.any23.source.LocalCopyFactory;
34  import org.apache.any23.source.MemCopyFactory;
35  import org.apache.any23.validator.EmptyValidationReport;
36  import org.apache.any23.validator.ValidatorException;
37  import org.apache.any23.vocab.SINDICE;
38  import org.apache.any23.writer.CompositeTripleHandler;
39  import org.apache.any23.writer.CountingTripleHandler;
40  import org.apache.any23.writer.TripleHandler;
41  import org.apache.any23.writer.TripleHandlerException;
42  import org.apache.any23.extractor.Extractor.BlindExtractor;
43  import org.apache.any23.extractor.Extractor.ContentExtractor;
44  import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
45  import org.apache.tika.mime.MimeTypes;
46  import org.eclipse.rdf4j.model.BNode;
47  import org.eclipse.rdf4j.model.IRI;
48  import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
49  import org.slf4j.Logger;
50  import org.slf4j.LoggerFactory;
51  
52  import java.io.BufferedInputStream;
53  import java.io.ByteArrayOutputStream;
54  import java.io.IOException;
55  import java.io.InputStream;
56  import java.io.PrintStream;
57  import java.net.URISyntaxException;
58  import java.util.ArrayList;
59  import java.util.Collection;
60  import java.util.Collections;
61  import java.util.Date;
62  import java.util.HashMap;
63  import java.util.List;
64  import java.util.Map;
65  import java.util.UUID;
66  import java.util.stream.Collectors;
67  
68  import static org.apache.any23.extractor.TagSoupExtractionResult.PropertyPath;
69  import static org.apache.any23.extractor.TagSoupExtractionResult.ResourceRoot;
70  
/**
 * This class acts as a facade where all the extractors are called on a single document.
 */
public class SingleDocumentExtraction {

    // Vocabulary singleton used to mint SINDICE metadata properties (e.g. domain triples).
    private static final SINDICE vSINDICE = SINDICE.getInstance();

    private static final Logger log = LoggerFactory.getLogger(SingleDocumentExtraction.class);

    // Configuration applied to every extraction run.
    private final Configuration configuration;

    // Document source the extraction operates on.
    private final DocumentSource in;

    // IRI of the document under extraction; resolved at the beginning of run().
    private IRI documentIRI;

    // Full set of candidate extractors supplied at construction time.
    private final ExtractorGroup extractors;

    // Composite handler wrapping the caller's handler plus a counting handler.
    private final TripleHandler output;

    // Detector used to guess the character encoding of the input.
    private final EncodingDetector encoderDetector;

    // Factory producing the local document copy; lazily defaults to MemCopyFactory.
    private LocalCopyFactory copyFactory = null;

    // Cached local copy of the document source; populated by ensureHasLocalCopy().
    private DocumentSource localDocumentSource = null;

    // Optional MIME type detector; when null detection is skipped and all extractors are activated.
    private MIMETypeDetector detector = null;

    // Extractors filtered by the detected MIME type; null until filtering has run.
    private ExtractorGroup matchingExtractors = null;

    // MIME type detected for the document, if detection is enabled.
    private MIMEType detectedMIMEType = null;

    // Cached tag-soup DOM plus validation report; invalidated when the encoding changes.
    private DocumentReport documentReport = null;

    // Parameters used to build the cached DOM, compared to detect when a re-parse is needed.
    private ExtractionParameters tagSoupDOMRelatedParameters = null;

    // Encoding used by the parser; lazily detected when not set explicitly.
    private String parserEncoding = null;
107 
108     /**
109      * Builds an extractor by the specification of document source,
110      * list of extractors and output triple handler.
111      *
112      * @param configuration configuration applied during extraction.
113      * @param in input document source.
114      * @param extractors list of extractors to be applied.
115      * @param output output triple handler.
116      */
117     public SingleDocumentExtraction(
118             Configuration configuration, DocumentSource in, ExtractorGroup extractors, TripleHandler output
119     ) {
120         if(configuration == null)
121             throw new NullPointerException("configuration cannot be null.");
122         if(in == null)
123             throw new NullPointerException("in cannot be null.");
124         this.configuration = configuration;
125         this.in = in;
126         this.extractors = extractors;
127 
128         List<TripleHandler> tripleHandlers = new ArrayList<>();
129         tripleHandlers.add(output);
130         tripleHandlers.add(new CountingTripleHandler());
131         this.output = new CompositeTripleHandler(tripleHandlers);
132         this.encoderDetector = new TikaEncodingDetector();
133     }
134 
135     /**
136      * Builds an extractor by the specification of document source,
137      * extractors factory and output triple handler.
138      *
139      * @param configuration configuration applied during extraction.
140      * @param in input document source.
141      * @param factory the extractors factory.
142      * @param output output triple handler.
143      */
144     public SingleDocumentExtraction(
145             Configuration configuration, DocumentSource in, ExtractorFactory<?> factory, TripleHandler output
146     ) {
147         this(
148                 configuration,
149                 in,
150                 new ExtractorGroup(Collections.<ExtractorFactory<?>>singletonList(factory)),
151                 output
152         );
153         this.setMIMETypeDetector(null);
154     }
155 
156     /**
157      * Builds an extractor by the specification of document source,
158      * extractors factory and output triple handler, using the
159      * {@link org.apache.any23.configuration.DefaultConfiguration}.
160      *
161      * @param in input document source.
162      * @param factory the extractors factory.
163      * @param output output triple handler.
164      */
165     public SingleDocumentExtraction(
166         DocumentSource in, ExtractorFactory<?> factory, TripleHandler output
167     ) {
168         this(
169                 DefaultConfiguration.singleton(),
170                 in,
171                 new ExtractorGroup(Collections.<ExtractorFactory<?>>singletonList(factory)),
172                 output
173         );
174         this.setMIMETypeDetector(null);
175     }
176 
177     /**
178      * Sets the internal factory for generating the document local copy,
179      * if <code>null</code> the {@link org.apache.any23.source.MemCopyFactory} will be used.
180      *
181      * @param copyFactory local copy factory.
182      * @see org.apache.any23.source.DocumentSource
183      */
184     public void setLocalCopyFactory(LocalCopyFactory copyFactory) {
185         this.copyFactory = copyFactory;
186     }
187 
188     /**
189      * Sets the internal mime type detector,
190      * if <code>null</code> mimetype detection will
191      * be skipped and all extractors will be activated.
192      *
193      * @param detector detector instance.
194      */
195     public void setMIMETypeDetector(MIMETypeDetector detector) {
196         this.detector = detector;
197     }
198 
    /**
     * Triggers the execution of all the {@link Extractor}
     * registered to this class using the specified extraction parameters.
     *
     * @param extractionParameters the parameters applied to the run execution;
     *        when <code>null</code> the configuration defaults are used.
     * @return the report generated by the extraction.
     * @throws ExtractionException if an error occurred during the data extraction.
     * @throws IOException if an error occurred during the data access.
     */
    public SingleDocumentExtractionReport run(ExtractionParameters extractionParameters)
    throws ExtractionException, IOException {
        if(extractionParameters == null) {
            extractionParameters = ExtractionParameters.newDefault(configuration);
        }

        // Resolve the extraction context IRI: the "?" placeholder means "use the document IRI itself".
        final String contextIRI = extractionParameters.getProperty(ExtractionParameters.EXTRACTION_CONTEXT_IRI_PROPERTY);
        ensureHasLocalCopy();
        try {
            this.documentIRI = new Any23ValueFactoryWrapper(
                    SimpleValueFactory.getInstance()
            ).createIRI( "?".equals(contextIRI) ? in.getDocumentIRI() : contextIRI);
        } catch (Exception ex) {
            throw new IllegalArgumentException("Invalid IRI: " + in.getDocumentIRI(), ex);
        }
        if (log.isDebugEnabled()) {
            log.debug("Processing " + this.documentIRI);
        }
        // Lazily determines the extractors applicable to the detected MIME type.
        filterExtractorsByMIMEType();

        if(log.isDebugEnabled()) {
            StringBuilder sb = new StringBuilder("Extractors ");
            for (ExtractorFactory<?> factory : matchingExtractors) {
                sb.append(factory.getExtractorName());
                sb.append(' ');
            }
            sb.append("match ").append(documentIRI);
            log.debug(sb.toString());
        }

        // Accumulators populated by each extractor run; consumed by the consolidation phase below.
        final List<ResourceRoot> resourceRoots = new ArrayList<>();
        final List<PropertyPath> propertyPaths = new ArrayList<>();
        final Map<String,Collection<IssueReport.Issue>> extractorToIssues =
            new HashMap<>();

        // Invoke all extractors.
        try {
            output.startDocument(documentIRI);
        } catch (TripleHandlerException e) {
            log.error(String.format("Error starting document with IRI %s", documentIRI));
            throw new ExtractionException(String.format("Error starting document with IRI %s", documentIRI),
                    e
            );
        }
        try {
	        output.setContentLength(in.getContentLength());
	        // Create the document context.
            final String documentLanguage;
	        try {
	            documentLanguage = extractDocumentLanguage(extractionParameters);
	            ArrayList<ExtractorFactory<?>> filteredList = new ArrayList<>(matchingExtractors.getNumOfExtractors());
                final boolean mimeTypeIsTooGeneric = isTooGeneric(detectedMIMEType);
                ArrayList<String> intersectionOfRdfMimetypes = null;
                for (ExtractorFactory<?> factory : matchingExtractors) {
	                final Extractor<?> extractor = factory.createExtractor();
	                final SingleExtractionReport er = runExtractor(
	                        extractionParameters,
	                        documentLanguage,
	                        extractor
	                );
	                // Fix for ANY23-415:
                    if (mimeTypeIsTooGeneric) {
                        List<String> rdfMimetypes = factory.getSupportedMIMETypes().stream()
                                .filter(mt -> !isTooGeneric(mt))
                                .map(MIMEType::getFullType)
                                .collect(Collectors.toList());
                        if (er.touched) {
                            // If detected mimetype is too generic, but we find extractors matching
                            // this mimetype that are capable of producing RDF triples from this resource,
                            // and these extractors are also associated with more specific RDF mimetypes,
                            // then we can simply take the intersection of these more specific mimetypes
                            // to narrow down the generic, non-RDF mimetype to a specific RDF mimetype.
                            if (intersectionOfRdfMimetypes == null) {
                                intersectionOfRdfMimetypes = new ArrayList<>(rdfMimetypes);
                            } else {
                                intersectionOfRdfMimetypes.retainAll(rdfMimetypes);
                            }
                        } else if (!rdfMimetypes.isEmpty()) {
                            // If detected mimetype is too generic, and this extractor matches both the
                            // generic mimetype and a more specific mimetype, but did not produce any RDF
                            // triples, then we can safely assume that this extractor is not actually a
                            // match for the type of file we are parsing (e.g., a "humans.txt" file).
                            continue;
                        }
                    }
	                resourceRoots.addAll( er.resourceRoots );
	                propertyPaths.addAll( er.propertyPaths );
	                filteredList.add(factory);
	                extractorToIssues.put(factory.getExtractorName(), er.issues);
	            }
                // Only extractors that actually contributed remain in the matching group.
                matchingExtractors = new ExtractorGroup(filteredList);
                if (intersectionOfRdfMimetypes != null && !intersectionOfRdfMimetypes.isEmpty()) {
                    // If the detected mimetype is a generic, non-RDF mimetype, and the intersection
                    // of specific RDF mimetypes across all triple-producing extractors is non-empty,
                    // simply replace the generic mimetype with a specific RDF mimetype in that intersection.
                    detectedMIMEType = MIMEType.parse(intersectionOfRdfMimetypes.get(0));
                }
	        } catch(ValidatorException ve) {
	            throw new ExtractionException("An error occurred during the validation phase.", ve);
	        }
	
	        // Resource consolidation.
	        final boolean addDomainTriples = extractionParameters.getFlag(ExtractionParameters.METADATA_DOMAIN_PER_ENTITY_FLAG);
	        final ExtractionContext consolidationContext;
	        if(extractionParameters.getFlag(ExtractionParameters.METADATA_NESTING_FLAG)) {
	            // Consolidation with nesting.
	            consolidationContext = consolidateResources(resourceRoots, propertyPaths, addDomainTriples, output, documentLanguage);
	        } else {
	            consolidationContext = consolidateResources(resourceRoots, addDomainTriples, output, documentLanguage);
	        }
	
	        // Adding time/size meta triples.
	        if (extractionParameters.getFlag(ExtractionParameters.METADATA_TIMESIZE_FLAG)) {
	            try {
	                addExtractionTimeSizeMetaTriples(consolidationContext);
	            } catch (TripleHandlerException e) {
	                throw new ExtractionException(
	                        String.format(
	                                "Error while adding extraction metadata triples document with IRI %s", documentIRI
	                        ),
	                        e
	                );
	            }
	        }
        } finally {
            // The document is always closed, even when an extractor or the consolidation fails.
	        try {
	            output.endDocument(documentIRI);
	        } catch (TripleHandlerException e) {
	            log.error(String.format("Error ending document with IRI %s", documentIRI));
	            throw new ExtractionException(String.format("Error ending document with IRI %s", documentIRI),
	                    e
	            );
	        }
        }

        return new SingleDocumentExtractionReport(
                documentReport == null
                        ?
                EmptyValidationReport.getInstance() : documentReport.getReport(),
                extractorToIssues
        );
    }
350 
351     private static boolean isTooGeneric(MIMEType type) {
352         if (type == null || type.isAnySubtype()) {
353             return true;
354         }
355         String mt = type.getFullType();
356         return mt.equals(MimeTypes.PLAIN_TEXT)
357                 || mt.equals(MimeTypes.OCTET_STREAM)
358                 || mt.equals(MimeTypes.XML);
359     }
360 
361     /**
362      * Triggers the execution of all the {@link Extractor}
363      * registered to this class using the <i>default</i> extraction parameters.
364      *
365      * @throws IOException if there is an error reading input from the document source
366      * @throws ExtractionException if there is an error duing distraction
367      * @return the extraction report.
368      */
369     public SingleDocumentExtractionReport run() throws IOException, ExtractionException {
370         return run(ExtractionParameters.newDefault(configuration));
371     }
372 
373     /**
374      * Returns the detected mimetype for the given {@link org.apache.any23.source.DocumentSource}.
375      *
376      * @return string containing the detected mimetype.
377      * @throws IOException if an error occurred while accessing the data.
378      */
379     public String getDetectedMIMEType() throws IOException {
380         filterExtractorsByMIMEType();
381         return  detectedMIMEType == null ? null : detectedMIMEType.toString();
382     }
383 
384     /**
385      * Check whether the given {@link org.apache.any23.source.DocumentSource} content activates of not at least an extractor.
386      *
387      * @return <code>true</code> if at least an extractor is activated, <code>false</code> otherwise.
388      * @throws IOException if there is an error locating matching extractors
389      */
390     public boolean hasMatchingExtractors() throws IOException {
391         filterExtractorsByMIMEType();
392         return !matchingExtractors.isEmpty();
393     }
394 
395     /**
396      * @return the list of all the activated extractors for the given {@link org.apache.any23.source.DocumentSource}.
397      */
398     @SuppressWarnings("rawtypes")
399     public List<Extractor> getMatchingExtractors() {
400         final List<Extractor> extractorsList = new ArrayList<>();
401         for(ExtractorFactory extractorFactory : matchingExtractors) {
402             extractorsList.add( extractorFactory.createExtractor() );
403         }
404         return extractorsList;
405     }
406 
407     /**
408      * @return the configured parsing encoding.
409      */
410     public String getParserEncoding() {
411         if(this.parserEncoding == null) {
412             this.parserEncoding = detectEncoding();
413         }
414         return this.parserEncoding;
415     }
416 
417     /**
418      * Sets the document parser encoding.
419      *
420      * @param encoding parser encoding.
421      */
422     public void setParserEncoding(String encoding) {
423         this.parserEncoding = encoding;
424         documentReport = null;
425     }
426 
427     /**
428      * Chech whether the given {@link org.apache.any23.source.DocumentSource} is an <b>HTML</b> document.
429      *
430      * @return <code>true</code> if the document source is an HTML document.
431      * @throws IOException if an error occurs while accessing data.
432      */
433     private boolean isHTMLDocument() throws IOException {
434         filterExtractorsByMIMEType();
435         return ! matchingExtractors.filterByMIMEType( MIMEType.parse("text/html") ).isEmpty();
436     }
437 
438     /**
439      * Extracts the document language where possible.
440      *
441      * @param extractionParameters extraction parameters to be applied to determine the document language.
442      * @return the document language if any, <code>null</code> otherwise.
443      * @throws java.io.IOException if an error occurs during the document analysis.
444      * @throws org.apache.any23.validator.ValidatorException
445      */
446     private String extractDocumentLanguage(ExtractionParameters extractionParameters)
447     throws IOException, ValidatorException {
448         if( ! isHTMLDocument() ) {
449             return null;
450         }
451         final HTMLDocument document;
452         try {
453             document = new HTMLDocument( getTagSoupDOM(extractionParameters).getDocument() );
454         } catch (IOException ioe) {
455             log.debug("Cannot extract language from document.", ioe);
456             return null;
457         }
458         return document.getDefaultLanguage();
459     }
460 
461     /**
462      * Generates a list of extractors that can be applied to the given document.
463      *
464      * @throws IOException
465      */
466     private void filterExtractorsByMIMEType()
467     throws IOException {
468         if (matchingExtractors != null)
469             return;  // has already been run.
470 
471         if (detector == null || extractors.allExtractorsSupportAllContentTypes()) {
472             matchingExtractors = extractors;
473             return;
474         }
475         ensureHasLocalCopy();
476         // detect MIME based on the real file IRI rather than based on given base namespace
477         detectedMIMEType = detector.guessMIMEType(
478                 java.net.URI.create(in.getDocumentIRI()).getPath(),
479                 localDocumentSource.openInputStream(),
480                 MIMEType.parse(localDocumentSource.getContentType())
481         );
482         log.debug("detected media type: " + detectedMIMEType);
483         matchingExtractors = extractors.filterByMIMEType(detectedMIMEType);
484     }
485 
486     /**
487      * Triggers the execution of a specific {@link Extractor}.
488      * 
489      * @param extractionParameters the parameters used for the extraction.
490      * @param extractor the {@link Extractor} to be executed.
491      * @throws ExtractionException if an error specific to an extractor happens.
492      * @throws IOException if an IO error occurs during the extraction.
493      * @return the roots of the resources that have been extracted.
494      * @throws org.apache.any23.validator.ValidatorException if an error occurs during validation.
495      */
496     private SingleExtractionReport runExtractor(
497             final ExtractionParameters extractionParameters,
498             final String documentLanguage,
499             final Extractor<?> extractor
500     ) throws ExtractionException, IOException, ValidatorException {
501         if(log.isDebugEnabled()) {
502             log.debug("Running {} on {}", extractor.getDescription().getExtractorName(), documentIRI);
503         }
504         long startTime = System.currentTimeMillis();
505         final ExtractionContextExtractionContext">ExtractionContext extractionContext = new ExtractionContext(
506                 extractor.getDescription().getExtractorName(),
507                 documentIRI,
508                 documentLanguage
509         );
510         final ExtractionResultImpl#ExtractionResultImpl">ExtractionResultImpl extractionResult = new ExtractionResultImpl(extractionContext, extractor, output);
511         try {
512             if (extractor instanceof BlindExtractor) {
513                 final BlindExtractor blindExtractor = (BlindExtractor) extractor;
514                 blindExtractor.run(extractionParameters, extractionContext, documentIRI, extractionResult);
515             } else if (extractor instanceof ContentExtractor) {
516                 ensureHasLocalCopy();
517                 final ContentExtractor contentExtractor = (ContentExtractor) extractor;
518                 contentExtractor.run(
519                         extractionParameters,
520                         extractionContext,
521                         localDocumentSource.openInputStream(),
522                         extractionResult
523                 );
524             } else if (extractor instanceof TagSoupDOMExtractor) {
525                 final TagSoupDOMExtractor tagSoupDOMExtractor = (TagSoupDOMExtractor) extractor;
526                 final DocumentReport documentReport = getTagSoupDOM(extractionParameters);
527                 tagSoupDOMExtractor.run(
528                         extractionParameters,
529                         extractionContext,
530                         documentReport.getDocument(),
531                         extractionResult
532                 );
533             } else {
534                 throw new IllegalStateException("Extractor type not supported: " + extractor.getClass());
535             }
536             return
537                 new SingleExtractionReport(
538                     extractionResult.getIssues(),
539                     new ArrayList<ResourceRoot>( extractionResult.getResourceRoots() ),
540                     new ArrayList<PropertyPath>( extractionResult.getPropertyPaths() ),
541                     extractionResult.wasTouched()
542                 );
543         } catch (ExtractionException ex) {
544             if(log.isDebugEnabled()) {
545                 log.debug(extractor.getDescription().getExtractorName() + ": " + ex.getMessage());
546             }
547             throw ex;
548         } finally {
549             // Logging result error report.
550             if(log.isDebugEnabled() && extractionResult.hasIssues() ) {
551                 ByteArrayOutputStream baos = new ByteArrayOutputStream();
552                 extractionResult.printReport(new PrintStream(baos));
553                 log.debug(baos.toString());
554             }
555             extractionResult.close();
556 
557             long elapsed = System.currentTimeMillis() - startTime;
558             if(log.isDebugEnabled()) {
559                 log.debug("Completed " + extractor.getDescription().getExtractorName() + ", " + elapsed + "ms");
560             }
561         }
562     }
563 
564     /**
565      * Forces the retrieval of the document data.
566      *
567      * @throws IOException
568      */
569     private void ensureHasLocalCopy() throws IOException {
570         if (localDocumentSource != null)
571             return;
572         if (in.isLocal()) {
573             localDocumentSource = in;
574             return;
575         }
576         if (copyFactory == null) {
577             copyFactory = new MemCopyFactory();
578         }
579         localDocumentSource = copyFactory.createLocalCopy(in);
580     }
581 
582     /**
583      * Returns the DOM of the given document source (that must be an HTML stream)
584      * and the report of eventual fixes applied on it.
585      *
586      * @param extractionParameters parameters to be used during extraction.
587      * @return document report.
588      * @throws IOException if an error occurs during data access.
589      * @throws ValidatorException if an error occurs during validation.
590      */
591     private DocumentReport getTagSoupDOM(ExtractionParameters extractionParameters)
592     throws IOException, ValidatorException {
593         if (documentReport == null || !extractionParameters.equals(tagSoupDOMRelatedParameters)) {
594             ensureHasLocalCopy();
595             final InputStream is = new BufferedInputStream( localDocumentSource.openInputStream() );
596             is.mark(Integer.MAX_VALUE);
597             final String candidateEncoding = getParserEncoding();
598             is.reset();
599             final TagSoupParserser.html#TagSoupParser">TagSoupParser tagSoupParser = new TagSoupParser(
600                     is,
601                     documentIRI.stringValue(),
602                     candidateEncoding
603             );
604             if(extractionParameters.isValidate()) {
605                 documentReport = tagSoupParser.getValidatedDOM( extractionParameters.isFix() );
606             } else {
607                 documentReport = new DocumentReport( EmptyValidationReport.getInstance(), tagSoupParser.getDOM() );
608             }
609             tagSoupDOMRelatedParameters = extractionParameters;
610         }
611         return documentReport;
612     }
613 
614     /**
615      * Detects the encoding of the local document source input stream.
616      * 
617      * @return a valid encoding value.
618      */
619     private String detectEncoding() {
620         try {
621             ensureHasLocalCopy();
622             InputStream is = new BufferedInputStream(localDocumentSource.openInputStream());
623             String encoding = this.encoderDetector.guessEncoding(is, localDocumentSource.getContentType());
624             is.close();
625             return encoding;
626         } catch (Exception e) {
627             throw new RuntimeException("An error occurred while trying to detect the input encoding.", e);
628         }
629     }
630 
631     /**
632      * This function verifies if the <i>candidateSub</i> list of strings
633      * is a prefix of <i>list</i>.
634      *
635      * @param list a list of strings.
636      * @param candidateSub a list of strings.
637      * @return <code>true</code> if <i>candidateSub</i> is a sub path of <i>list</i>,
638      *         <code>false</code> otherwise.
639      */
640     private boolean subPath(String[] list, String[] candidateSub) {
641         if(candidateSub.length > list.length) {
642             return false;
643         }
644         for(int i = 0; i < candidateSub.length; i++) {
645             if( ! candidateSub[i].equals(list[i])) {
646                 return false;
647             }
648         }
649         return true;
650     }
651 
652     /**
653      * Adds for every resource root node a page domain triple.
654      *
655      * @param resourceRoots list of resource roots.
656      * @param context extraction context to produce triples.
657      * @throws ExtractionException
658      */
659     private void addDomainTriplesPerResourceRoots(List<ResourceRoot> resourceRoots, ExtractionContext context)
660     throws ExtractionException {
661         try {
662             // Add source Web domains to every resource root.
663             String domain;
664             try {
665                 domain = new java.net.URI(in.getDocumentIRI()).getHost();
666             } catch (URISyntaxException urise) {
667                 throw new IllegalArgumentException(
668                         "An error occurred while extracting the host from the document IRI.",
669                         urise
670                 );
671             }
672             if (domain != null) {
673                 for (ResourceRoot resourceRoot : resourceRoots) {
674                     output.receiveTriple(
675                             resourceRoot.getRoot(),
676                             vSINDICE.getProperty(SINDICE.DOMAIN),
677                             SimpleValueFactory.getInstance().createLiteral(domain),
678                             null,
679                             context
680                     );
681                 }
682             }
683         } catch (TripleHandlerException e) {
684             throw new ExtractionException("Error while writing triple triple.", e);
685         } finally {
686             try {
687                 output.closeContext(context);
688             } catch (TripleHandlerException e) {
689                 throw new ExtractionException("Error while closing context.", e);
690             }
691         }
692     }
693 
694     /**
695      * @return an extraction context specific for consolidation triples.
696      */
697     private ExtractionContext createExtractionContext(String defaultLanguage) {
698         return new ExtractionContext(
699                 "consolidation-extractor",
700                 documentIRI,
701                 defaultLanguage,
702                 UUID.randomUUID().toString()
703         );
704     }
705 
706     /**
707      * Detect the nesting relationship among different
708      * Microformats and explicit them adding connection triples.
709      *
710      * @param resourceRoots
711      * @param propertyPaths
712      * @param context
713      * @throws TripleHandlerException
714      */
715     private void addNestingRelationship(
716             List<ResourceRoot> resourceRoots,
717             List<PropertyPath> propertyPaths,
718             ExtractionContext context
719     ) throws TripleHandlerException {
720         ResourceRoot currentResourceRoot;
721         PropertyPath currentPropertyPath;
722         for (int r = 0; r < resourceRoots.size(); r++) {
723             currentResourceRoot = resourceRoots.get(r);
724             for (int p = 0; p < propertyPaths.size(); p++) {
725                 currentPropertyPath = propertyPaths.get(p);
726                 Class<? extends MicroformatExtractor> currentResourceRootExtractor = currentResourceRoot.getExtractor();
727                 Class<? extends MicroformatExtractor> currentPropertyPathExtractor = currentPropertyPath.getExtractor();
728                 // Avoid wrong nesting relationships.
729                 if (currentResourceRootExtractor.equals(currentPropertyPathExtractor)) {
730                     continue;
731                 }
732                 // Avoid self declaring relationships
733                 if(MicroformatExtractor.includes(currentPropertyPathExtractor, currentResourceRootExtractor)) {
734                     continue;
735                 }
736                 if (subPath(currentResourceRoot.getPath(), currentPropertyPath.getPath())) {
737                     createNestingRelationship(currentPropertyPath, currentResourceRoot, output, context);
738                 }
739             }
740         }
741     }
742 
743     /**
744      * This method consolidates the graphs extracted from the same document.
745      * In particular it adds:
746      * <ul>
747      *   <li>for every microformat root node a triple indicating the original Web page domain;</li>
748      *   <li>triples indicating the nesting relationship among a microformat root and property paths of
749      *       other nested microformats.
750      *   </li>
751      * </ul>
752      * @param resourceRoots list of RDF nodes representing roots of
753      *        extracted microformat graphs and the corresponding HTML paths.
754      * @param propertyPaths list of RDF nodes representing property subjects, property IRIs and the HTML paths
755      *        from which such properties have been extracted. 
756      * @param addDomainTriples
757      * @param output a triple handler event collector.
758      * @return
759      * @throws ExtractionException
760      */
761     private ExtractionContext consolidateResources(
762             List<ResourceRoot> resourceRoots,
763             List<PropertyPath> propertyPaths,
764             boolean addDomainTriples,
765             TripleHandler output,
766             String defaultLanguage
767     ) throws ExtractionException {
768         final ExtractionContext context = createExtractionContext(defaultLanguage);
769 
770         try {
771             output.openContext(context);
772         } catch (TripleHandlerException e) {
773             throw new ExtractionException(
774                     String.format("Error starting document with IRI %s", documentIRI),
775                     e
776             );
777         }
778 
779         try {
780             if(addDomainTriples) {
781                 addDomainTriplesPerResourceRoots(resourceRoots, context);
782             }
783             addNestingRelationship(resourceRoots, propertyPaths, context);
784         } catch (TripleHandlerException the) {
785             throw new ExtractionException("Error while writing triple triple.", the);
786         } finally {
787             try {
788                 output.closeContext(context);
789             } catch (TripleHandlerException e) {
790                 throw new ExtractionException("Error while closing context.", e);
791             }
792         }
793 
794         return context;
795     }
796 
797     /**
798      * This method consolidates the graphs extracted from the same document.
799      * In particular it adds:
800      * <ul>
801      *   <li>for every microformat root node a triple indicating the original Web page domain;</li>
802      * </ul>
803      * @param resourceRoots list of RDF nodes representing roots of
804      *        extracted microformat graphs and the corresponding HTML paths.
805      *        from which such properties have been extracted.
806      * @param addDomainTriples
807      * @param output a triple handler event collector.
808      * @return
809      * @throws ExtractionException
810      */
811     private ExtractionContext consolidateResources(
812             List<ResourceRoot> resourceRoots,
813             boolean addDomainTriples,
814             TripleHandler output,
815             String defaultLanguage
816     ) throws ExtractionException {
817         final ExtractionContext context = createExtractionContext(defaultLanguage);
818 
819         try {
820             output.openContext(context);
821         } catch (TripleHandlerException e) {
822             throw new ExtractionException(
823                     String.format("Error starting document with IRI %s", documentIRI),
824                     e
825             );
826         }
827 
828         try {
829             if(addDomainTriples) {
830                 addDomainTriplesPerResourceRoots(resourceRoots, context);
831             }
832         } finally {
833             try {
834                 output.closeContext(context);
835             } catch (TripleHandlerException the) {
836                 throw new ExtractionException("Error while closing context.", the);
837             }
838         }
839 
840         return context;
841     }
842 
843     /**
844      * Adds metadata triples containing the number of extracted triples
845      * and the extraction timestamp.
846      *
847      * @param context
848      * @throws TripleHandlerException
849      */
850     private void addExtractionTimeSizeMetaTriples(ExtractionContext context)
851     throws TripleHandlerException {
852         // adding extraction date
853         String xsdDateTimeNow = RDFUtils.toXSDDateTime(new Date());
854         output.receiveTriple(
855                 SimpleValueFactory.getInstance().createIRI(documentIRI.toString()),
856                 vSINDICE.getProperty(SINDICE.DATE),
857                 SimpleValueFactory.getInstance().createLiteral(xsdDateTimeNow),
858                 null,
859                 context
860         );
861 
862         // adding number of extracted triples
863         int numberOfTriples = 0;
864         CompositeTripleHandlerrg/apache/any23/writer/CompositeTripleHandler.html#CompositeTripleHandler">CompositeTripleHandler cth = (CompositeTripleHandler) output;
865         for (TripleHandler th : cth.getChilds()) {
866             if (th instanceof CountingTripleHandler) {
867                 numberOfTriples = ((CountingTripleHandler) th).getCount();
868             }
869         }
870         output.receiveTriple(
871                 SimpleValueFactory.getInstance().createIRI(documentIRI.toString()),
872                 vSINDICE.getProperty(SINDICE.SIZE),
873                 SimpleValueFactory.getInstance().createLiteral(numberOfTriples + 1), // the number of triples plus itself
874                 null,
875                 context
876         );
877     }
878 
879     /**
880      * Creates a nesting relationship triple.
881      * 
882      * @param from the property containing the nested microformat.
883      * @param to the root to the nested microformat.
884      * @param th the triple handler.
885      * @param ec the extraction context used to add such information.
886      * @throws org.apache.any23.writer.TripleHandlerException
887      */
888     private void createNestingRelationship(
889             PropertyPath from,
890             ResourceRoot to,
891             TripleHandler th,
892             ExtractionContext ec
893     ) throws TripleHandlerException {
894         final BNode fromObject = from.getObject();
895         final String bNodeHash = from.getProperty().stringValue() + ( fromObject == null ? "" : fromObject.getID() );
896         BNode bnode = RDFUtils.getBNode(bNodeHash);
897         th.receiveTriple(bnode, vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), from.getProperty(), null, ec );
898         th.receiveTriple(
899                 bnode,
900                 vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED),
901                 from.getObject() == null ? to.getRoot() : from.getObject(),
902                 null,
903                 ec
904         );
905         th.receiveTriple(
906                 from.getSubject(),
907                 vSINDICE.getProperty(SINDICE.NESTING),
908                 bnode,
909                 null,
910                 ec
911         );
912     }
913 
914     /**
915      * Entity detection report.
916      */
917     private static class SingleExtractionReport {
918         private final Collection<IssueReport.Issue> issues;
919         private final List<ResourceRoot>            resourceRoots;
920         private final List<PropertyPath>            propertyPaths;
921         private final boolean touched;
922 
923         public SingleExtractionReport(
924                 Collection<IssueReport.Issue>  issues,
925                 List<ResourceRoot> resourceRoots,
926                 List<PropertyPath> propertyPaths,
927                 boolean wasTouched
928         ) {
929             this.issues        = issues;
930             this.resourceRoots = resourceRoots;
931             this.propertyPaths = propertyPaths;
932             this.touched = wasTouched;
933         }
934     }
935 
936 }