/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.any23.extractor;

import org.apache.any23.configuration.Configuration;
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.encoding.EncodingDetector;
import org.apache.any23.encoding.TikaEncodingDetector;
import org.apache.any23.extractor.html.DocumentReport;
import org.apache.any23.extractor.html.HTMLDocument;
import org.apache.any23.extractor.html.MicroformatExtractor;
import org.apache.any23.extractor.html.TagSoupParser;
import org.apache.any23.mime.MIMEType;
import org.apache.any23.mime.MIMETypeDetector;
import org.apache.any23.rdf.Any23ValueFactoryWrapper;
import org.apache.any23.rdf.RDFUtils;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.LocalCopyFactory;
import org.apache.any23.source.MemCopyFactory;
import org.apache.any23.validator.EmptyValidationReport;
import org.apache.any23.validator.ValidatorException;
import org.apache.any23.vocab.SINDICE;
import org.apache.any23.writer.CompositeTripleHandler;
import org.apache.any23.writer.CountingTripleHandler;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.any23.extractor.Extractor.BlindExtractor;
import org.apache.any23.extractor.Extractor.ContentExtractor;
import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
import org.apache.tika.mime.MimeTypes;
import org.eclipse.rdf4j.model.BNode;
import org.eclipse.rdf4j.model.IRI;
import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintStream;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.UUID;
import java.util.stream.Collectors;

import static org.apache.any23.extractor.TagSoupExtractionResult.PropertyPath;
import static org.apache.any23.extractor.TagSoupExtractionResult.ResourceRoot;

/**
 * This class acts as a facade where all the registered extractors are applied to a single document.
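 * <p>
 * A minimal usage sketch; the source, registry and writer types named here ({@code StringDocumentSource},
 * {@code ExtractorRegistryImpl}, {@code NTriplesWriter}) are illustrative choices from elsewhere in the Any23
 * codebase:
 * </p>
 *
 * <pre>
 * {@code
 * DocumentSource source = new StringDocumentSource("<html>...</html>", "http://example.com/");
 * TripleHandler writer = new NTriplesWriter(System.out);
 * SingleDocumentExtraction extraction = new SingleDocumentExtraction(
 *         DefaultConfiguration.singleton(), source, ExtractorRegistryImpl.getInstance().getExtractorGroup(), writer);
 * SingleDocumentExtractionReport report = extraction.run();
 * }
 * </pre>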
 */
public class SingleDocumentExtraction {

    private static final SINDICE vSINDICE = SINDICE.getInstance();

    private static final Logger log = LoggerFactory.getLogger(SingleDocumentExtraction.class);

    private final Configuration configuration;

    private final DocumentSource in;

    private IRI documentIRI;

    private final ExtractorGroup extractors;

    private final TripleHandler output;

    private final EncodingDetector encoderDetector;

    private LocalCopyFactory copyFactory = null;

    private DocumentSource localDocumentSource = null;

    private MIMETypeDetector detector = null;

    private ExtractorGroup matchingExtractors = null;

    private MIMEType detectedMIMEType = null;

    private DocumentReport documentReport = null;

    private ExtractionParameters tagSoupDOMRelatedParameters = null;

    private String parserEncoding = null;

    /**
     * Builds an extractor with the specified document source, list of extractors and output triple handler.
     *
     * @param configuration
     *            configuration applied during extraction.
     * @param in
     *            input document source.
     * @param extractors
     *            list of extractors to be applied.
     * @param output
     *            output triple handler.
     */
    public SingleDocumentExtraction(Configuration configuration, DocumentSource in, ExtractorGroup extractors,
            TripleHandler output) {
        if (configuration == null)
            throw new NullPointerException("configuration cannot be null.");
        if (in == null)
            throw new NullPointerException("in cannot be null.");
        this.configuration = configuration;
        this.in = in;
        this.extractors = extractors;

        List<TripleHandler> tripleHandlers = new ArrayList<>();
        tripleHandlers.add(output);
        tripleHandlers.add(new CountingTripleHandler());
        this.output = new CompositeTripleHandler(tripleHandlers);
        this.encoderDetector = new TikaEncodingDetector();
    }

    /**
     * Builds an extractor with the specified document source, extractor factory and output triple handler.
     *
     * @param configuration
     *            configuration applied during extraction.
     * @param in
     *            input document source.
     * @param factory
     *            the extractors factory.
     * @param output
     *            output triple handler.
     */
    public SingleDocumentExtraction(Configuration configuration, DocumentSource in, ExtractorFactory<?> factory,
            TripleHandler output) {
        this(configuration, in, new ExtractorGroup(Collections.<ExtractorFactory<?>> singletonList(factory)), output);
        this.setMIMETypeDetector(null);
    }

    /**
     * Builds an extractor with the specified document source, extractor factory and output triple handler, using the
     * {@link org.apache.any23.configuration.DefaultConfiguration}.
     *
     * @param in
     *            input document source.
     * @param factory
     *            the extractors factory.
     * @param output
     *            output triple handler.
     */
    public SingleDocumentExtraction(DocumentSource in, ExtractorFactory<?> factory, TripleHandler output) {
        this(DefaultConfiguration.singleton(), in,
                new ExtractorGroup(Collections.<ExtractorFactory<?>> singletonList(factory)), output);
        this.setMIMETypeDetector(null);
    }

    /**
     * Sets the internal factory for generating the document local copy; if <code>null</code>, the
     * {@link org.apache.any23.source.MemCopyFactory} will be used.
     *
     * @param copyFactory
     *            local copy factory.
     *
     * @see org.apache.any23.source.DocumentSource
     */
    public void setLocalCopyFactory(LocalCopyFactory copyFactory) {
        this.copyFactory = copyFactory;
    }

    /**
     * Sets the internal MIME type detector; if <code>null</code>, MIME type detection will be skipped and all
     * extractors will be activated.
     *
     * @param detector
     *            detector instance.
     */
    public void setMIMETypeDetector(MIMETypeDetector detector) {
        this.detector = detector;
    }

    /**
     * Triggers the execution of all the {@link Extractor}s registered with this class, using the specified extraction
     * parameters.
     *
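     * <p>
     * A minimal invocation sketch (equivalent to {@link #run()}), where <code>extraction</code> is an instance of
     * this class; the parameter construction mirrors the default used below:
     * </p>
     *
     * <pre>
     * {@code
     * SingleDocumentExtractionReport report = extraction.run(ExtractionParameters.newDefault(configuration));
     * }
     * </pre>
     *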
     * @param extractionParameters
     *            the parameters applied to the run execution.
     *
     * @return the report generated by the extraction.
     *
     * @throws ExtractionException
     *             if an error occurred during the data extraction.
     * @throws IOException
     *             if an error occurred during the data access.
     */
    public SingleDocumentExtractionReport run(ExtractionParameters extractionParameters)
            throws ExtractionException, IOException {
        if (extractionParameters == null) {
            extractionParameters = ExtractionParameters.newDefault(configuration);
        }

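        // A context IRI property value of "?" means: fall back to the document's own IRI as extraction context.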
        final String contextIRI = extractionParameters
                .getProperty(ExtractionParameters.EXTRACTION_CONTEXT_IRI_PROPERTY);
        ensureHasLocalCopy();
        try {
            this.documentIRI = new Any23ValueFactoryWrapper(SimpleValueFactory.getInstance())
                    .createIRI("?".equals(contextIRI) ? in.getDocumentIRI() : contextIRI);
        } catch (Exception ex) {
            throw new IllegalArgumentException("Invalid IRI: " + in.getDocumentIRI(), ex);
        }
        if (log.isDebugEnabled()) {
            log.debug("Processing " + this.documentIRI);
        }
        filterExtractorsByMIMEType();

        if (log.isDebugEnabled()) {
            StringBuilder sb = new StringBuilder("Extractors ");
            for (ExtractorFactory<?> factory : matchingExtractors) {
                sb.append(factory.getExtractorName());
                sb.append(' ');
            }
            sb.append("match ").append(documentIRI);
            log.debug(sb.toString());
        }

        final List<ResourceRoot> resourceRoots = new ArrayList<>();
        final List<PropertyPath> propertyPaths = new ArrayList<>();
        final Map<String, Collection<IssueReport.Issue>> extractorToIssues = new HashMap<>();

        // Invoke all extractors.
        try {
            output.startDocument(documentIRI);
        } catch (TripleHandlerException e) {
            log.error(String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI));
            throw new ExtractionException(
                    String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI), e);
        }
        try {
            output.setContentLength(in.getContentLength());
            // Create the document context.
            final String documentLanguage;
            try {
                documentLanguage = extractDocumentLanguage(extractionParameters);
                ArrayList<ExtractorFactory<?>> filteredList = new ArrayList<>(matchingExtractors.getNumOfExtractors());
                final boolean mimeTypeIsTooGeneric = isTooGeneric(detectedMIMEType);
                ArrayList<String> intersectionOfRdfMimetypes = null;
                for (ExtractorFactory<?> factory : matchingExtractors) {
                    final Extractor<?> extractor = factory.createExtractor();
                    final SingleExtractionReport er = runExtractor(extractionParameters, documentLanguage, extractor);
                    // Fix for ANY23-415:
                    if (mimeTypeIsTooGeneric) {
                        List<String> rdfMimetypes = factory.getSupportedMIMETypes().stream()
                                .filter(mt -> !isTooGeneric(mt)).map(MIMEType::getFullType)
                                .collect(Collectors.toList());
                        if (er.touched) {
                            // If detected mimetype is too generic, but we find extractors matching
                            // this mimetype that are capable of producing RDF triples from this resource,
                            // and these extractors are also associated with more specific RDF mimetypes,
                            // then we can simply take the intersection of these more specific mimetypes
                            // to narrow down the generic, non-RDF mimetype to a specific RDF mimetype.
                            if (intersectionOfRdfMimetypes == null) {
                                intersectionOfRdfMimetypes = new ArrayList<>(rdfMimetypes);
                            } else {
                                intersectionOfRdfMimetypes.retainAll(rdfMimetypes);
                            }
                        } else if (!rdfMimetypes.isEmpty()) {
                            // If detected mimetype is too generic, and this extractor matches both the
                            // generic mimetype and a more specific mimetype, but did not produce any RDF
                            // triples, then we can safely assume that this extractor is not actually a
                            // match for the type of file we are parsing (e.g., a "humans.txt" file).
                            continue;
                        }
                    }
                    resourceRoots.addAll(er.resourceRoots);
                    propertyPaths.addAll(er.propertyPaths);
                    filteredList.add(factory);
                    extractorToIssues.put(factory.getExtractorName(), er.issues);
                }
                matchingExtractors = new ExtractorGroup(filteredList);
                if (intersectionOfRdfMimetypes != null && !intersectionOfRdfMimetypes.isEmpty()) {
                    // If the detected mimetype is a generic, non-RDF mimetype, and the intersection
                    // of specific RDF mimetypes across all triple-producing extractors is non-empty,
                    // simply replace the generic mimetype with a specific RDF mimetype in that intersection.
                    detectedMIMEType = MIMEType.parse(intersectionOfRdfMimetypes.get(0));
                }
            } catch (ValidatorException ve) {
                throw new ExtractionException("An error occurred during the validation phase.", ve);
            }

            // Resource consolidation.
            final boolean addDomainTriples = extractionParameters
                    .getFlag(ExtractionParameters.METADATA_DOMAIN_PER_ENTITY_FLAG);
            final ExtractionContext consolidationContext;
            if (extractionParameters.getFlag(ExtractionParameters.METADATA_NESTING_FLAG)) {
                // Consolidation with nesting.
                consolidationContext = consolidateResources(resourceRoots, propertyPaths, addDomainTriples, output,
                        documentLanguage);
            } else {
                consolidationContext = consolidateResources(resourceRoots, addDomainTriples, output, documentLanguage);
            }

            // Adding time/size meta triples.
            if (extractionParameters.getFlag(ExtractionParameters.METADATA_TIMESIZE_FLAG)) {
                try {
                    addExtractionTimeSizeMetaTriples(consolidationContext);
                } catch (TripleHandlerException e) {
                    throw new ExtractionException(
                            String.format(Locale.ROOT,
                                    "Error while adding extraction metadata triples to document with IRI %s",
                                    documentIRI),
                            e);
                }
            }
        } finally {
            try {
                output.endDocument(documentIRI);
            } catch (TripleHandlerException e) {
                log.error(String.format(Locale.ROOT, "Error ending document with IRI %s", documentIRI));
                throw new ExtractionException(
                        String.format(Locale.ROOT, "Error ending document with IRI %s", documentIRI), e);
            }
        }

        return new SingleDocumentExtractionReport(
                documentReport == null ? EmptyValidationReport.getInstance() : documentReport.getReport(),
                extractorToIssues);
    }

    private static boolean isTooGeneric(MIMEType type) {
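        // "Too generic" means: no type detected, a wildcard subtype, or one of the catch-all
        // plain-text / octet-stream / XML types checked below.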
        if (type == null || type.isAnySubtype()) {
            return true;
        }
        String mt = type.getFullType();
        return mt.equals(MimeTypes.PLAIN_TEXT) || mt.equals(MimeTypes.OCTET_STREAM) || mt.equals(MimeTypes.XML);
    }

    /**
     * Triggers the execution of all the {@link Extractor}s registered with this class, using the <i>default</i>
     * extraction parameters.
     *
     * @throws IOException
     *             if there is an error reading input from the document source.
     * @throws ExtractionException
     *             if there is an error during extraction.
     *
     * @return the extraction report.
     */
    public SingleDocumentExtractionReport run() throws IOException, ExtractionException {
        return run(ExtractionParameters.newDefault(configuration));
    }

    /**
     * Returns the detected mimetype for the given {@link org.apache.any23.source.DocumentSource}.
     *
     * @return string containing the detected mimetype.
     *
     * @throws IOException
     *             if an error occurred while accessing the data.
     */
    public String getDetectedMIMEType() throws IOException {
        filterExtractorsByMIMEType();
        return detectedMIMEType == null ? null : detectedMIMEType.toString();
    }

    /**
     * Checks whether the content of the given {@link org.apache.any23.source.DocumentSource} activates at least one
     * extractor.
     *
     * @return <code>true</code> if at least one extractor is activated, <code>false</code> otherwise.
     *
     * @throws IOException
     *             if there is an error locating matching extractors
     */
    public boolean hasMatchingExtractors() throws IOException {
        filterExtractorsByMIMEType();
        return !matchingExtractors.isEmpty();
    }

    /**
     * @return the list of all the activated extractors for the given {@link org.apache.any23.source.DocumentSource}.
     */
    @SuppressWarnings("rawtypes")
    public List<Extractor> getMatchingExtractors() {
        final List<Extractor> extractorsList = new ArrayList<>();
        for (ExtractorFactory extractorFactory : matchingExtractors) {
            extractorsList.add(extractorFactory.createExtractor());
        }
        return extractorsList;
    }

    /**
     * @return the configured parsing encoding.
     */
    public String getParserEncoding() {
        if (this.parserEncoding == null) {
            this.parserEncoding = detectEncoding();
        }
        return this.parserEncoding;
    }

    /**
     * Sets the document parser encoding.
     *
     * @param encoding
     *            parser encoding.
     */
    public void setParserEncoding(String encoding) {
        this.parserEncoding = encoding;
        documentReport = null;
    }

    /**
     * Checks whether the given {@link org.apache.any23.source.DocumentSource} is an <b>HTML</b> document.
     *
     * @return <code>true</code> if the document source is an HTML document.
     *
     * @throws IOException
     *             if an error occurs while accessing data.
     */
    private boolean isHTMLDocument() throws IOException {
        filterExtractorsByMIMEType();
        return !matchingExtractors.filterByMIMEType(MIMEType.parse("text/html")).isEmpty();
    }

    /**
     * Extracts the document language where possible.
     *
     * @param extractionParameters
     *            extraction parameters to be applied to determine the document language.
     *
     * @return the document language if any, <code>null</code> otherwise.
     *
     * @throws java.io.IOException
     *             if an error occurs during the document analysis.
     * @throws org.apache.any23.validator.ValidatorException
     *             if an error occurs during the validation of the document DOM.
     */
    private String extractDocumentLanguage(ExtractionParameters extractionParameters)
            throws IOException, ValidatorException {
        if (!isHTMLDocument()) {
            return null;
        }
        final HTMLDocument document;
        try {
            document = new HTMLDocument(getTagSoupDOM(extractionParameters).getDocument());
        } catch (IOException ioe) {
            log.debug("Cannot extract language from document.", ioe);
            return null;
        }
        return document.getDefaultLanguage();
    }

    /**
     * Generates a list of extractors that can be applied to the given document.
     *
     * @throws IOException
     *             if an error occurs while accessing the document data.
     */
    private void filterExtractorsByMIMEType() throws IOException {
        if (matchingExtractors != null)
            return; // has already been run.

        if (detector == null || extractors.allExtractorsSupportAllContentTypes()) {
            matchingExtractors = extractors;
            return;
        }
        ensureHasLocalCopy();
        // detect MIME based on the real file IRI rather than based on given base namespace
        detectedMIMEType = detector.guessMIMEType(java.net.URI.create(in.getDocumentIRI()).getPath(),
                localDocumentSource.openInputStream(), MIMEType.parse(localDocumentSource.getContentType()));
        log.debug("detected media type: " + detectedMIMEType);
        matchingExtractors = extractors.filterByMIMEType(detectedMIMEType);
    }

    /**
     * Triggers the execution of a specific {@link Extractor}.
     *
     * @param extractionParameters
     *            the parameters used for the extraction.
     * @param documentLanguage
     *            the language of the document being processed, <code>null</code> if unknown.
     * @param extractor
     *            the {@link Extractor} to be executed.
     *
     * @throws ExtractionException
     *             if an error specific to an extractor happens.
     * @throws IOException
     *             if an IO error occurs during the extraction.
     *
     * @return the roots of the resources that have been extracted.
     *
     * @throws org.apache.any23.validator.ValidatorException
     *             if an error occurs during validation.
     */
    private SingleExtractionReport runExtractor(final ExtractionParameters extractionParameters,
            final String documentLanguage, final Extractor<?> extractor)
            throws ExtractionException, IOException, ValidatorException {
        if (log.isDebugEnabled()) {
            log.debug("Running {} on {}", extractor.getDescription().getExtractorName(), documentIRI);
        }
        long startTime = System.currentTimeMillis();
        final ExtractionContext extractionContext = new ExtractionContext(
                extractor.getDescription().getExtractorName(), documentIRI, documentLanguage);
        final ExtractionResultImpl extractionResult = new ExtractionResultImpl(extractionContext, extractor, output);
        try {
            if (extractor instanceof BlindExtractor) {
                final BlindExtractor blindExtractor = (BlindExtractor) extractor;
                blindExtractor.run(extractionParameters, extractionContext, documentIRI, extractionResult);
            } else if (extractor instanceof ContentExtractor) {
                ensureHasLocalCopy();
                final ContentExtractor contentExtractor = (ContentExtractor) extractor;
                contentExtractor.run(extractionParameters, extractionContext, localDocumentSource.openInputStream(),
                        extractionResult);
            } else if (extractor instanceof TagSoupDOMExtractor) {
                final TagSoupDOMExtractor tagSoupDOMExtractor = (TagSoupDOMExtractor) extractor;
                final DocumentReport documentReport = getTagSoupDOM(extractionParameters);
                tagSoupDOMExtractor.run(extractionParameters, extractionContext, documentReport.getDocument(),
                        extractionResult);
            } else {
                throw new IllegalStateException("Extractor type not supported: " + extractor.getClass());
            }
            return new SingleExtractionReport(extractionResult.getIssues(),
                    new ArrayList<ResourceRoot>(extractionResult.getResourceRoots()),
                    new ArrayList<PropertyPath>(extractionResult.getPropertyPaths()), extractionResult.wasTouched());
        } catch (ExtractionException ex) {
            if (log.isDebugEnabled()) {
                log.debug(extractor.getDescription().getExtractorName() + ": " + ex.getMessage());
            }
            throw ex;
        } finally {
            // Logging result error report.
            if (log.isDebugEnabled() && extractionResult.hasIssues()) {
                ByteArrayOutputStream baos = new ByteArrayOutputStream();
                extractionResult.printReport(new PrintStream(baos, true, "UTF-8"));
                log.debug(baos.toString("UTF-8"));
            }
            extractionResult.close();

            long elapsed = System.currentTimeMillis() - startTime;
            if (log.isDebugEnabled()) {
                log.debug("Completed " + extractor.getDescription().getExtractorName() + ", " + elapsed + "ms");
            }
        }
    }

    /**
     * Forces the retrieval of the document data.
     *
     * @throws IOException
     *             if an error occurs while retrieving or copying the document.
     */
    private void ensureHasLocalCopy() throws IOException {
        if (localDocumentSource != null)
            return;
        if (in.isLocal()) {
            localDocumentSource = in;
            return;
        }
        if (copyFactory == null) {
            copyFactory = new MemCopyFactory();
        }
        localDocumentSource = copyFactory.createLocalCopy(in);
    }

    /**
     * Returns the DOM of the given document source (that must be an HTML stream) and the report of any fixes applied
     * to it.
     *
     * @param extractionParameters
     *            parameters to be used during extraction.
     *
     * @return document report.
     *
     * @throws IOException
     *             if an error occurs during data access.
     * @throws ValidatorException
     *             if an error occurs during validation.
     */
    private DocumentReport getTagSoupDOM(ExtractionParameters extractionParameters)
            throws IOException, ValidatorException {
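        // Reuse the cached DOM unless the extraction parameters differ from those used to build it.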
        if (documentReport == null || !extractionParameters.equals(tagSoupDOMRelatedParameters)) {
            ensureHasLocalCopy();
            final InputStream is = new BufferedInputStream(localDocumentSource.openInputStream());
            is.mark(Integer.MAX_VALUE);
            final String candidateEncoding = getParserEncoding();
            is.reset();
            final TagSoupParser tagSoupParser = new TagSoupParser(is, documentIRI.stringValue(), candidateEncoding);
            if (extractionParameters.isValidate()) {
                documentReport = tagSoupParser.getValidatedDOM(extractionParameters.isFix());
            } else {
                documentReport = new DocumentReport(EmptyValidationReport.getInstance(), tagSoupParser.getDOM());
            }
            tagSoupDOMRelatedParameters = extractionParameters;
        }
        return documentReport;
    }

    /**
     * Detects the encoding of the local document source input stream.
     *
     * @return a valid encoding value.
     */
    private String detectEncoding() {
        try {
            ensureHasLocalCopy();
            // Close the stream even if detection fails.
            try (InputStream is = new BufferedInputStream(localDocumentSource.openInputStream())) {
                return this.encoderDetector.guessEncoding(is, localDocumentSource.getContentType());
            }
        } catch (Exception e) {
            throw new RuntimeException("An error occurred while trying to detect the input encoding.", e);
        }
    }

    /**
     * Verifies whether the <i>candidateSub</i> array of strings is a prefix of <i>list</i>.
     *
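     * <p>
     * For example:
     * </p>
     *
     * <pre>
     * {@code
     * subPath(new String[] { "html", "body", "div" }, new String[] { "html", "body" }); // true
     * subPath(new String[] { "html", "body" }, new String[] { "html", "body", "div" }); // false
     * }
     * </pre>
     *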
     * @param list
     *            a list of strings.
     * @param candidateSub
     *            a list of strings.
     *
     * @return <code>true</code> if <i>candidateSub</i> is a prefix of <i>list</i>, <code>false</code> otherwise.
     */
    private boolean subPath(String[] list, String[] candidateSub) {
        if (candidateSub.length > list.length) {
            return false;
        }
        for (int i = 0; i < candidateSub.length; i++) {
            if (!candidateSub[i].equals(list[i])) {
                return false;
            }
        }
        return true;
    }

    /**
     * Adds a page domain triple for every resource root node.
     *
     * @param resourceRoots
     *            list of resource roots.
     * @param context
     *            extraction context to produce triples.
     *
     * @throws ExtractionException
     *             if an error occurs while writing triples or closing the context.
     */
    private void addDomainTriplesPerResourceRoots(List<ResourceRoot> resourceRoots, ExtractionContext context)
            throws ExtractionException {
        try {
            // Add source Web domains to every resource root.
            String domain;
            try {
                domain = new java.net.URI(in.getDocumentIRI()).getHost();
            } catch (URISyntaxException urise) {
                throw new IllegalArgumentException("An error occurred while extracting the host from the document IRI.",
                        urise);
            }
            if (domain != null) {
                for (ResourceRoot resourceRoot : resourceRoots) {
                    output.receiveTriple(resourceRoot.getRoot(), vSINDICE.getProperty(SINDICE.DOMAIN),
                            SimpleValueFactory.getInstance().createLiteral(domain), null, context);
                }
            }
        } catch (TripleHandlerException e) {
            throw new ExtractionException("Error while writing triple.", e);
        } finally {
            try {
                output.closeContext(context);
            } catch (TripleHandlerException e) {
                throw new ExtractionException("Error while closing context.", e);
            }
        }
    }

    /**
     * @param defaultLanguage
     *            the default language for the consolidation triples.
     *
     * @return an extraction context specific for consolidation triples.
     */
    private ExtractionContext createExtractionContext(String defaultLanguage) {
        return new ExtractionContext("consolidation-extractor", documentIRI, defaultLanguage,
                UUID.randomUUID().toString());
    }

    /**
     * Detects the nesting relationships among different microformats and makes them explicit by adding connection
     * triples.
     *
     * @param resourceRoots
     *            list of resource roots.
     * @param propertyPaths
     *            list of property paths.
     * @param context
     *            extraction context to produce triples.
     *
     * @throws TripleHandlerException
     *             if an error occurs while writing the nesting triples.
     */
    private void addNestingRelationship(List<ResourceRoot> resourceRoots, List<PropertyPath> propertyPaths,
            ExtractionContext context) throws TripleHandlerException {
        ResourceRoot currentResourceRoot;
        PropertyPath currentPropertyPath;
        for (int r = 0; r < resourceRoots.size(); r++) {
            currentResourceRoot = resourceRoots.get(r);
            for (int p = 0; p < propertyPaths.size(); p++) {
                currentPropertyPath = propertyPaths.get(p);
                Class<? extends MicroformatExtractor> currentResourceRootExtractor = currentResourceRoot.getExtractor();
                Class<? extends MicroformatExtractor> currentPropertyPathExtractor = currentPropertyPath.getExtractor();
                // Avoid wrong nesting relationships.
                if (currentResourceRootExtractor.equals(currentPropertyPathExtractor)) {
                    continue;
                }
                // Avoid self declaring relationships
                if (MicroformatExtractor.includes(currentPropertyPathExtractor, currentResourceRootExtractor)) {
                    continue;
                }
                if (subPath(currentResourceRoot.getPath(), currentPropertyPath.getPath())) {
                    createNestingRelationship(currentPropertyPath, currentResourceRoot, output, context);
                }
            }
        }
    }

    /**
     * This method consolidates the graphs extracted from the same document. In particular it adds:
     * <ul>
     * <li>for every microformat root node a triple indicating the original Web page domain;</li>
     * <li>triples indicating the nesting relationship among a microformat root and property paths of other nested
     * microformats.</li>
     * </ul>
     *
     * @param resourceRoots
     *            list of RDF nodes representing roots of extracted microformat graphs and the corresponding HTML paths.
     * @param propertyPaths
     *            list of RDF nodes representing property subjects, property IRIs and the HTML paths from which such
     *            properties have been extracted.
     * @param addDomainTriples
     *            if <code>true</code>, a domain triple is added for every resource root.
     * @param output
     *            a triple handler event collector.
     * @param defaultLanguage
     *            the default language for the consolidation context.
     *
     * @return the consolidation extraction context.
     *
     * @throws ExtractionException
     *             if an error occurs while writing triples or managing the context.
     */
    private ExtractionContext consolidateResources(List<ResourceRoot> resourceRoots, List<PropertyPath> propertyPaths,
            boolean addDomainTriples, TripleHandler output, String defaultLanguage) throws ExtractionException {
        final ExtractionContext context = createExtractionContext(defaultLanguage);

        try {
            output.openContext(context);
        } catch (TripleHandlerException e) {
            throw new ExtractionException(
                    String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI), e);
        }

        try {
            if (addDomainTriples) {
                addDomainTriplesPerResourceRoots(resourceRoots, context);
            }
            addNestingRelationship(resourceRoots, propertyPaths, context);
        } catch (TripleHandlerException the) {
            throw new ExtractionException("Error while writing triple.", the);
        } finally {
            try {
                output.closeContext(context);
            } catch (TripleHandlerException e) {
                throw new ExtractionException("Error while closing context.", e);
            }
        }

        return context;
    }

    /**
     * This method consolidates the graphs extracted from the same document. In particular it adds:
     * <ul>
     * <li>for every microformat root node a triple indicating the original Web page domain;</li>
     * </ul>
     *
     * @param resourceRoots
     *            list of RDF nodes representing roots of extracted microformat graphs and the corresponding HTML paths.
     * @param addDomainTriples
     *            if <code>true</code>, a domain triple is added for every resource root.
     * @param output
     *            a triple handler event collector.
     * @param defaultLanguage
     *            the default language for the consolidation context.
     *
     * @return the consolidation extraction context.
     *
     * @throws ExtractionException
     *             if an error occurs while writing triples or managing the context.
     */
    private ExtractionContext consolidateResources(List<ResourceRoot> resourceRoots, boolean addDomainTriples,
            TripleHandler output, String defaultLanguage) throws ExtractionException {
        final ExtractionContext context = createExtractionContext(defaultLanguage);

        try {
            output.openContext(context);
        } catch (TripleHandlerException e) {
            throw new ExtractionException(
                    String.format(Locale.ROOT, "Error starting document with IRI %s", documentIRI), e);
        }

        try {
            if (addDomainTriples) {
                addDomainTriplesPerResourceRoots(resourceRoots, context);
            }
        } finally {
            try {
                output.closeContext(context);
            } catch (TripleHandlerException the) {
                throw new ExtractionException("Error while closing context.", the);
            }
        }

        return context;
    }

    /**
     * Adds metadata triples containing the number of extracted triples and the extraction timestamp.
     *
     * @param context
     *            the extraction context used to emit the metadata triples.
     *
     * @throws TripleHandlerException
     *             if an error occurs while writing the metadata triples.
     */
    private void addExtractionTimeSizeMetaTriples(ExtractionContext context) throws TripleHandlerException {
        // adding extraction date
        String xsdDateTimeNow = RDFUtils.toXSDDateTime(new Date());
        output.receiveTriple(SimpleValueFactory.getInstance().createIRI(documentIRI.toString()),
                vSINDICE.getProperty(SINDICE.DATE), SimpleValueFactory.getInstance().createLiteral(xsdDateTimeNow),
                null, context);

        // adding number of extracted triples
        int numberOfTriples = 0;
        CompositeTripleHandler cth = (CompositeTripleHandler) output;
        for (TripleHandler th : cth.getChilds()) {
            if (th instanceof CountingTripleHandler) {
                numberOfTriples = ((CountingTripleHandler) th).getCount();
            }
        }
        output.receiveTriple(SimpleValueFactory.getInstance().createIRI(documentIRI.toString()),
                vSINDICE.getProperty(SINDICE.SIZE),
                SimpleValueFactory.getInstance().createLiteral(numberOfTriples + 1), // the number of triples plus itself
                null, context);
    }

    /**
     * Creates a nesting relationship triple.
     *
     * @param from
     *            the property containing the nested microformat.
     * @param to
     *            the root of the nested microformat.
     * @param th
     *            the triple handler.
     * @param ec
     *            the extraction context used to add such information.
     *
     * @throws org.apache.any23.writer.TripleHandlerException
     *             if an error occurs while writing the nesting triples.
     */
    private void createNestingRelationship(PropertyPath from, ResourceRoot to, TripleHandler th, ExtractionContext ec)
            throws TripleHandlerException {
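        // The blank node is keyed on the property IRI plus (when present) the object blank node id,
        // so the same nesting pair always resolves to the same node.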
        final BNode fromObject = from.getObject();
        final String bNodeHash = from.getProperty().stringValue() + (fromObject == null ? "" : fromObject.getID());
        BNode bnode = RDFUtils.getBNode(bNodeHash);
        th.receiveTriple(bnode, vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), from.getProperty(), null, ec);
        th.receiveTriple(bnode, vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED),
                from.getObject() == null ? to.getRoot() : from.getObject(), null, ec);
        th.receiveTriple(from.getSubject(), vSINDICE.getProperty(SINDICE.NESTING), bnode, null, ec);
    }

    /**
     * Entity detection report.
     */
    private static class SingleExtractionReport {
        private final Collection<IssueReport.Issue> issues;
        private final List<ResourceRoot> resourceRoots;
        private final List<PropertyPath> propertyPaths;
        private final boolean touched;

        public SingleExtractionReport(Collection<IssueReport.Issue> issues, List<ResourceRoot> resourceRoots,
                List<PropertyPath> propertyPaths, boolean wasTouched) {
            this.issues = issues;
            this.resourceRoots = resourceRoots;
            this.propertyPaths = propertyPaths;
            this.touched = wasTouched;
        }
    }

}