1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor;
19
20 import org.apache.any23.configuration.Configuration;
21 import org.apache.any23.configuration.DefaultConfiguration;
22 import org.apache.any23.encoding.EncodingDetector;
23 import org.apache.any23.encoding.TikaEncodingDetector;
24 import org.apache.any23.extractor.html.DocumentReport;
25 import org.apache.any23.extractor.html.HTMLDocument;
26 import org.apache.any23.extractor.html.MicroformatExtractor;
27 import org.apache.any23.extractor.html.TagSoupParser;
28 import org.apache.any23.mime.MIMEType;
29 import org.apache.any23.mime.MIMETypeDetector;
30 import org.apache.any23.rdf.Any23ValueFactoryWrapper;
31 import org.apache.any23.rdf.RDFUtils;
32 import org.apache.any23.source.DocumentSource;
33 import org.apache.any23.source.LocalCopyFactory;
34 import org.apache.any23.source.MemCopyFactory;
35 import org.apache.any23.validator.EmptyValidationReport;
36 import org.apache.any23.validator.ValidatorException;
37 import org.apache.any23.vocab.SINDICE;
38 import org.apache.any23.writer.CompositeTripleHandler;
39 import org.apache.any23.writer.CountingTripleHandler;
40 import org.apache.any23.writer.TripleHandler;
41 import org.apache.any23.writer.TripleHandlerException;
42 import org.apache.any23.extractor.Extractor.BlindExtractor;
43 import org.apache.any23.extractor.Extractor.ContentExtractor;
44 import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
45 import org.openrdf.model.BNode;
46 import org.openrdf.model.URI;
47 import org.openrdf.model.impl.URIImpl;
48 import org.openrdf.model.impl.ValueFactoryImpl;
49 import org.slf4j.Logger;
50 import org.slf4j.LoggerFactory;
51
52 import java.io.BufferedInputStream;
53 import java.io.ByteArrayOutputStream;
54 import java.io.IOException;
55 import java.io.InputStream;
56 import java.io.PrintStream;
57 import java.net.URISyntaxException;
58 import java.util.ArrayList;
59 import java.util.Collection;
60 import java.util.Collections;
61 import java.util.Date;
62 import java.util.HashMap;
63 import java.util.List;
64 import java.util.Map;
65 import java.util.UUID;
66
67 import static org.apache.any23.extractor.TagSoupExtractionResult.PropertyPath;
68 import static org.apache.any23.extractor.TagSoupExtractionResult.ResourceRoot;
69
70
71
72
73 public class SingleDocumentExtraction {
74
75 private static final SINDICE vSINDICE = SINDICE.getInstance();
76
77 private final static Logger log = LoggerFactory.getLogger(SingleDocumentExtraction.class);
78
79 private final Configuration configuration;
80
81 private final DocumentSource in;
82
83 private URI documentURI;
84
85 private final ExtractorGroup extractors;
86
87 private final TripleHandler output;
88
89 private final EncodingDetector encoderDetector;
90
91 private LocalCopyFactory copyFactory = null;
92
93 private DocumentSource localDocumentSource = null;
94
95 private MIMETypeDetector detector = null;
96
97 private ExtractorGroup matchingExtractors = null;
98
99 private MIMEType detectedMIMEType = null;
100
101 private DocumentReport documentReport = null;
102
103 private ExtractionParameters tagSoupDOMRelatedParameters = null;
104
105 private String parserEncoding = null;
106
107
108
109
110
111
112
113
114
115
116 public SingleDocumentExtraction(
117 Configuration configuration, DocumentSource in, ExtractorGroup extractors, TripleHandler output
118 ) {
119 if(configuration == null) throw new NullPointerException("configuration cannot be null.");
120 if(in == null) throw new NullPointerException("in cannot be null.");
121 this.configuration = configuration;
122 this.in = in;
123 this.extractors = extractors;
124
125 List<TripleHandler> tripleHandlers = new ArrayList<TripleHandler>();
126 tripleHandlers.add(output);
127 tripleHandlers.add(new CountingTripleHandler());
128 this.output = new CompositeTripleHandler(tripleHandlers);
129 this.encoderDetector = new TikaEncodingDetector();
130 }
131
132
133
134
135
136
137
138
139
140
/**
 * Builds an extraction over the given document source, using a single extractor
 * and an explicit configuration. The MIME type detector is reset to <code>null</code>,
 * hence the extractor is applied without any filtering by MIME type.
 *
 * @param configuration configuration applied during the extraction, cannot be <code>null</code>.
 * @param in the source of the document to be processed, cannot be <code>null</code>.
 * @param factory the factory of the only extractor to be applied.
 * @param output the handler receiving the extracted triples.
 */
public SingleDocumentExtraction(
        Configuration configuration, DocumentSource in, ExtractorFactory<?> factory, TripleHandler output
) {
    this(configuration, in, new ExtractorGroup(Collections.<ExtractorFactory<?>>singletonList(factory)), output);
    setMIMETypeDetector(null);
}
152
153
154
155
156
157
158
159
160
161
/**
 * Builds an extraction over the given document source, using a single extractor
 * and the default configuration ({@link DefaultConfiguration#singleton()}).
 * As for the sibling constructor accepting a configuration, the MIME type detector
 * is reset to <code>null</code>.
 *
 * @param in the source of the document to be processed, cannot be <code>null</code>.
 * @param factory the factory of the only extractor to be applied.
 * @param output the handler receiving the extracted triples.
 */
public SingleDocumentExtraction(
        DocumentSource in, ExtractorFactory<?> factory, TripleHandler output
) {
    // The single-factory constructor wraps the factory into a group and resets the MIME type detector.
    this(DefaultConfiguration.singleton(), in, factory, output);
}
173
174
175
176
177
178
179
180
181 public void setLocalCopyFactory(LocalCopyFactory copyFactory) {
182 this.copyFactory = copyFactory;
183 }
184
185
186
187
188
189
190
191
192 public void setMIMETypeDetector(MIMETypeDetector detector) {
193 this.detector = detector;
194 }
195
196
197
198
199
200
201
202
203
204
205 public SingleDocumentExtractionReport run(ExtractionParameters extractionParameters)
206 throws ExtractionException, IOException {
207 if(extractionParameters == null) {
208 extractionParameters = ExtractionParameters.newDefault(configuration);
209 }
210
211 final String contextURI = extractionParameters.getProperty(ExtractionParameters.EXTRACTION_CONTEXT_URI_PROPERTY);
212 ensureHasLocalCopy();
213 try {
214 this.documentURI = new Any23ValueFactoryWrapper(
215 ValueFactoryImpl.getInstance()
216 ).createURI( "?".equals(contextURI) ? in.getDocumentURI() : contextURI);
217 } catch (Exception ex) {
218 throw new IllegalArgumentException("Invalid URI: " + in.getDocumentURI(), ex);
219 }
220 if(log.isInfoEnabled()) {
221 log.info("Processing " + this.documentURI);
222 }
223 filterExtractorsByMIMEType();
224
225 if(log.isDebugEnabled()) {
226 StringBuffer sb = new StringBuffer("Extractors ");
227 for (ExtractorFactory<?> factory : matchingExtractors) {
228 sb.append(factory.getExtractorName());
229 sb.append(' ');
230 }
231 sb.append("match ").append(documentURI);
232 log.debug(sb.toString());
233 }
234
235
236 try {
237 output.startDocument(documentURI);
238 } catch (TripleHandlerException e) {
239 log.error(String.format("Error starting document with URI %s", documentURI));
240 throw new ExtractionException(String.format("Error starting document with URI %s", documentURI),
241 e
242 );
243 }
244 output.setContentLength(in.getContentLength());
245
246 final List<ResourceRoot> resourceRoots = new ArrayList<ResourceRoot>();
247 final List<PropertyPath> propertyPaths = new ArrayList<PropertyPath>();
248 final Map<String,Collection<IssueReport.Issue>> extractorToIssues =
249 new HashMap<String,Collection<IssueReport.Issue>>();
250 try {
251 final String documentLanguage = extractDocumentLanguage(extractionParameters);
252 for (ExtractorFactory<?> factory : matchingExtractors) {
253 final Extractor extractor = factory.createExtractor();
254 final SingleExtractionReport er = runExtractor(
255 extractionParameters,
256 documentLanguage,
257 extractor
258 );
259 resourceRoots.addAll( er.resourceRoots );
260 propertyPaths.addAll( er.propertyPaths );
261 extractorToIssues.put(factory.getExtractorName(), er.issues);
262 }
263 } catch(ValidatorException ve) {
264 throw new ExtractionException("An error occurred during the validation phase.", ve);
265 }
266
267
268 final boolean addDomainTriples = extractionParameters.getFlag(ExtractionParameters.METADATA_DOMAIN_PER_ENTITY_FLAG);
269 final ExtractionContext consolidationContext;
270 if(extractionParameters.getFlag(ExtractionParameters.METADATA_NESTING_FLAG)) {
271
272 consolidationContext = consolidateResources(resourceRoots, propertyPaths, addDomainTriples, output);
273 } else {
274 consolidationContext = consolidateResources(resourceRoots, addDomainTriples, output);
275 }
276
277
278 if (extractionParameters.getFlag(ExtractionParameters.METADATA_TIMESIZE_FLAG)) {
279 try {
280 addExtractionTimeSizeMetaTriples(consolidationContext);
281 } catch (TripleHandlerException e) {
282 throw new ExtractionException(
283 String.format(
284 "Error while adding extraction metadata triples document with URI %s", documentURI
285 ),
286 e
287 );
288 }
289 }
290
291 try {
292 output.endDocument(documentURI);
293 } catch (TripleHandlerException e) {
294 log.error(String.format("Error ending document with URI %s", documentURI));
295 throw new ExtractionException(String.format("Error ending document with URI %s", documentURI),
296 e
297 );
298 }
299
300 return new SingleDocumentExtractionReport(
301 documentReport == null
302 ?
303 EmptyValidationReport.getInstance() : documentReport.getReport(),
304 extractorToIssues
305 );
306 }
307
308
309
310
311
312
313
314
315
316 public SingleDocumentExtractionReport run() throws IOException, ExtractionException {
317 return run(ExtractionParameters.newDefault(configuration));
318 }
319
320
321
322
323
324
325
326 public String getDetectedMIMEType() throws IOException {
327 filterExtractorsByMIMEType();
328 return detectedMIMEType == null ? null : detectedMIMEType.toString();
329 }
330
331
332
333
334
335
336
337 public boolean hasMatchingExtractors() throws IOException {
338 filterExtractorsByMIMEType();
339 return !matchingExtractors.isEmpty();
340 }
341
342
343
344
345 public List<Extractor> getMatchingExtractors() {
346 final List<Extractor> extractorsList = new ArrayList<Extractor>();
347 for(ExtractorFactory extractorFactory : matchingExtractors) {
348 extractorsList.add( extractorFactory.createExtractor() );
349 }
350 return extractorsList;
351 }
352
353
354
355
356 public String getParserEncoding() {
357 if(this.parserEncoding == null) {
358 this.parserEncoding = detectEncoding();
359 }
360 return this.parserEncoding;
361 }
362
363
364
365
366
367
368 public void setParserEncoding(String encoding) {
369 this.parserEncoding = encoding;
370 documentReport = null;
371 }
372
373
374
375
376
377
378
379 private boolean isHTMLDocument() throws IOException {
380 filterExtractorsByMIMEType();
381 return ! matchingExtractors.filterByMIMEType( MIMEType.parse("text/html") ).isEmpty();
382 }
383
384
385
386
387
388
389
390
391
392 private String extractDocumentLanguage(ExtractionParameters extractionParameters)
393 throws IOException, ValidatorException {
394 if( ! isHTMLDocument() ) {
395 return null;
396 }
397 final HTMLDocument document;
398 try {
399 document = new HTMLDocument( getTagSoupDOM(extractionParameters).getDocument() );
400 } catch (IOException ioe) {
401 log.debug("Cannot extract language from document.", ioe);
402 return null;
403 }
404 return document.getDefaultLanguage();
405 }
406
407
408
409
410
411
412 private void filterExtractorsByMIMEType()
413 throws IOException {
414 if (matchingExtractors != null) return;
415
416 if (detector == null || extractors.allExtractorsSupportAllContentTypes()) {
417 matchingExtractors = extractors;
418 return;
419 }
420 ensureHasLocalCopy();
421 detectedMIMEType = detector.guessMIMEType(
422 java.net.URI.create(documentURI.stringValue()).getPath(),
423 localDocumentSource.openInputStream(),
424 MIMEType.parse(localDocumentSource.getContentType())
425 );
426 log.debug("detected media type: " + detectedMIMEType);
427 matchingExtractors = extractors.filterByMIMEType(detectedMIMEType);
428 }
429
430
431
432
433
434
435
436
437
438
439
440 private SingleExtractionReport runExtractor(
441 final ExtractionParameters extractionParameters,
442 final String documentLanguage,
443 final Extractor<?> extractor
444 ) throws ExtractionException, IOException, ValidatorException {
445 if(log.isDebugEnabled()) {
446 log.debug("Running " + extractor.getDescription().getExtractorName() + " on " + documentURI);
447 }
448 long startTime = System.currentTimeMillis();
449 final ExtractionContext extractionContext = new ExtractionContext(
450 extractor.getDescription().getExtractorName(),
451 documentURI,
452 documentLanguage
453 );
454 final ExtractionResultImpl extractionResult = new ExtractionResultImpl(extractionContext, extractor, output);
455 try {
456 if (extractor instanceof BlindExtractor) {
457 final BlindExtractor blindExtractor = (BlindExtractor) extractor;
458 blindExtractor.run(extractionParameters, extractionContext, documentURI, extractionResult);
459 } else if (extractor instanceof ContentExtractor) {
460 ensureHasLocalCopy();
461 final ContentExtractor contentExtractor = (ContentExtractor) extractor;
462 contentExtractor.run(
463 extractionParameters,
464 extractionContext,
465 localDocumentSource.openInputStream(),
466 extractionResult
467 );
468 } else if (extractor instanceof TagSoupDOMExtractor) {
469 final TagSoupDOMExtractor tagSoupDOMExtractor = (TagSoupDOMExtractor) extractor;
470 final DocumentReport documentReport = getTagSoupDOM(extractionParameters);
471 tagSoupDOMExtractor.run(
472 extractionParameters,
473 extractionContext,
474 documentReport.getDocument(),
475 extractionResult
476 );
477 } else {
478 throw new IllegalStateException("Extractor type not supported: " + extractor.getClass());
479 }
480 return
481 new SingleExtractionReport(
482 extractionResult.getIssues(),
483 new ArrayList<ResourceRoot>( extractionResult.getResourceRoots() ),
484 new ArrayList<PropertyPath>( extractionResult.getPropertyPaths() )
485 );
486 } catch (ExtractionException ex) {
487 if(log.isDebugEnabled()) {
488 log.debug(extractor.getDescription().getExtractorName() + ": " + ex.getMessage());
489 }
490 throw ex;
491 } finally {
492
493 if(log.isDebugEnabled() && extractionResult.hasIssues() ) {
494 ByteArrayOutputStream baos = new ByteArrayOutputStream();
495 extractionResult.printReport(new PrintStream(baos));
496 log.debug(baos.toString());
497 }
498 extractionResult.close();
499
500 long elapsed = System.currentTimeMillis() - startTime;
501 if(log.isDebugEnabled()) {
502 log.debug("Completed " + extractor.getDescription().getExtractorName() + ", " + elapsed + "ms");
503 }
504 }
505 }
506
507
508
509
510
511
512 private void ensureHasLocalCopy() throws IOException {
513 if (localDocumentSource != null) return;
514 if (in.isLocal()) {
515 localDocumentSource = in;
516 return;
517 }
518 if (copyFactory == null) {
519 copyFactory = new MemCopyFactory();
520 }
521 localDocumentSource = copyFactory.createLocalCopy(in);
522 }
523
524
525
526
527
528
529
530
531
532
533 private DocumentReport getTagSoupDOM(ExtractionParameters extractionParameters)
534 throws IOException, ValidatorException {
535 if (documentReport == null || !extractionParameters.equals(tagSoupDOMRelatedParameters)) {
536 ensureHasLocalCopy();
537 final InputStream is = new BufferedInputStream( localDocumentSource.openInputStream() );
538 is.mark(Integer.MAX_VALUE);
539 final String candidateEncoding = getParserEncoding();
540 is.reset();
541 final TagSoupParser tagSoupParser = new TagSoupParser(
542 is,
543 documentURI.stringValue(),
544 candidateEncoding
545 );
546 if(extractionParameters.isValidate()) {
547 documentReport = tagSoupParser.getValidatedDOM( extractionParameters.isFix() );
548 } else {
549 documentReport = new DocumentReport( EmptyValidationReport.getInstance(), tagSoupParser.getDOM() );
550 }
551 tagSoupDOMRelatedParameters = extractionParameters;
552 }
553 return documentReport;
554 }
555
556
557
558
559
560
561 private String detectEncoding() {
562 try {
563 ensureHasLocalCopy();
564 InputStream is = new BufferedInputStream(localDocumentSource.openInputStream());
565 String encoding = this.encoderDetector.guessEncoding(is);
566 is.close();
567 return encoding;
568 } catch (Exception e) {
569 throw new RuntimeException("An error occurred while trying to detect the input encoding.", e);
570 }
571 }
572
573
574
575
576
577
578
579
580
581
582 private boolean subPath(String[] list, String[] candidateSub) {
583 if(candidateSub.length > list.length) {
584 return false;
585 }
586 for(int i = 0; i < candidateSub.length; i++) {
587 if( ! candidateSub[i].equals(list[i])) {
588 return false;
589 }
590 }
591 return true;
592 }
593
594
595
596
597
598
599
600
601 private void addDomainTriplesPerResourceRoots(List<ResourceRoot> resourceRoots, ExtractionContext context)
602 throws ExtractionException {
603 try {
604
605 String domain;
606 try {
607 domain = new java.net.URI(in.getDocumentURI()).getHost();
608 } catch (URISyntaxException urise) {
609 throw new IllegalArgumentException(
610 "An error occurred while extracting the host from the document URI.",
611 urise
612 );
613 }
614 if (domain != null) {
615 for (ResourceRoot resourceRoot : resourceRoots) {
616 output.receiveTriple(
617 resourceRoot.getRoot(),
618 vSINDICE.getProperty(SINDICE.DOMAIN),
619 ValueFactoryImpl.getInstance().createLiteral(domain),
620 null,
621 context
622 );
623 }
624 }
625 } catch (TripleHandlerException e) {
626 throw new ExtractionException("Error while writing triple triple.", e);
627 } finally {
628 try {
629 output.closeContext(context);
630 } catch (TripleHandlerException e) {
631 throw new ExtractionException("Error while closing context.", e);
632 }
633 }
634 }
635
636
637
638
639 private ExtractionContext createExtractionContext() {
640 return new ExtractionContext(
641 "consolidation-extractor",
642 documentURI,
643 UUID.randomUUID().toString()
644 );
645 }
646
647
648
649
650
651
652
653
654
655
656 private void addNestingRelationship(
657 List<ResourceRoot> resourceRoots,
658 List<PropertyPath> propertyPaths,
659 ExtractionContext context
660 ) throws TripleHandlerException {
661 ResourceRoot currentResourceRoot;
662 PropertyPath currentPropertyPath;
663 for (int r = 0; r < resourceRoots.size(); r++) {
664 currentResourceRoot = resourceRoots.get(r);
665 for (int p = 0; p < propertyPaths.size(); p++) {
666 currentPropertyPath = propertyPaths.get(p);
667 Class<? extends MicroformatExtractor> currentResourceRootExtractor = currentResourceRoot.getExtractor();
668 Class<? extends MicroformatExtractor> currentPropertyPathExtractor = currentPropertyPath.getExtractor();
669
670 if (currentResourceRootExtractor.equals(currentPropertyPathExtractor)) {
671 continue;
672 }
673
674 if(MicroformatExtractor.includes(currentPropertyPathExtractor, currentResourceRootExtractor)) {
675 continue;
676 }
677 if (subPath(currentResourceRoot.getPath(), currentPropertyPath.getPath())) {
678 createNestingRelationship(currentPropertyPath, currentResourceRoot, output, context);
679 }
680 }
681 }
682 }
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702 private ExtractionContext consolidateResources(
703 List<ResourceRoot> resourceRoots,
704 List<PropertyPath> propertyPaths,
705 boolean addDomainTriples,
706 TripleHandler output
707 ) throws ExtractionException {
708 final ExtractionContext context = createExtractionContext();
709
710 try {
711 output.openContext(context);
712 } catch (TripleHandlerException e) {
713 throw new ExtractionException(
714 String.format("Error starting document with URI %s", documentURI),
715 e
716 );
717 }
718
719 try {
720 if(addDomainTriples) {
721 addDomainTriplesPerResourceRoots(resourceRoots, context);
722 }
723 addNestingRelationship(resourceRoots, propertyPaths, context);
724 } catch (TripleHandlerException the) {
725 throw new ExtractionException("Error while writing triple triple.", the);
726 } finally {
727 try {
728 output.closeContext(context);
729 } catch (TripleHandlerException e) {
730 throw new ExtractionException("Error while closing context.", e);
731 }
732 }
733
734 return context;
735 }
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751 private ExtractionContext consolidateResources(
752 List<ResourceRoot> resourceRoots,
753 boolean addDomainTriples,
754 TripleHandler output
755 ) throws ExtractionException {
756 final ExtractionContext context = createExtractionContext();
757
758 try {
759 output.openContext(context);
760 } catch (TripleHandlerException e) {
761 throw new ExtractionException(
762 String.format("Error starting document with URI %s", documentURI),
763 e
764 );
765 }
766
767 try {
768 if(addDomainTriples) {
769 addDomainTriplesPerResourceRoots(resourceRoots, context);
770 }
771 } finally {
772 try {
773 output.closeContext(context);
774 } catch (TripleHandlerException the) {
775 throw new ExtractionException("Error while closing context.", the);
776 }
777 }
778
779 return context;
780 }
781
782
783
784
785
786
787
788
789 private void addExtractionTimeSizeMetaTriples(ExtractionContext context)
790 throws TripleHandlerException {
791
792 String xsdDateTimeNow = RDFUtils.toXSDDateTime(new Date());
793 output.receiveTriple(
794 new URIImpl(documentURI.toString()),
795 vSINDICE.getProperty(SINDICE.DATE),
796 ValueFactoryImpl.getInstance().createLiteral(xsdDateTimeNow),
797 null,
798 context
799 );
800
801
802 int numberOfTriples = 0;
803 CompositeTripleHandler cth = (CompositeTripleHandler) output;
804 for (TripleHandler th : cth.getChilds()) {
805 if (th instanceof CountingTripleHandler) {
806 numberOfTriples = ((CountingTripleHandler) th).getCount();
807 }
808 }
809 output.receiveTriple(
810 new URIImpl(documentURI.toString()),
811 vSINDICE.getProperty(SINDICE.SIZE),
812 ValueFactoryImpl.getInstance().createLiteral(numberOfTriples + 1),
813 null,
814 context
815 );
816 }
817
818
819
820
821
822
823
824
825
826
827 private void createNestingRelationship(
828 PropertyPath from,
829 ResourceRoot to,
830 TripleHandler th,
831 ExtractionContext ec
832 ) throws TripleHandlerException {
833 final BNode fromObject = from.getObject();
834 final String bNodeHash = from.getProperty().stringValue() + ( fromObject == null ? "" : fromObject.getID() );
835 BNode bnode = RDFUtils.getBNode(bNodeHash);
836 th.receiveTriple(bnode, vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), from.getProperty(), null, ec );
837 th.receiveTriple(
838 bnode,
839 vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED),
840 from.getObject() == null ? to.getRoot() : from.getObject(),
841 null,
842 ec
843 );
844 th.receiveTriple(
845 from.getSubject(),
846 vSINDICE.getProperty(SINDICE.NESTING),
847 bnode,
848 null,
849 ec
850 );
851 }
852
853
854
855
856 private class SingleExtractionReport {
857 private final Collection<IssueReport.Issue> issues;
858 private final List<ResourceRoot> resourceRoots;
859 private final List<PropertyPath> propertyPaths;
860
861 public SingleExtractionReport(
862 Collection<IssueReport.Issue> issues,
863 List<ResourceRoot> resourceRoots,
864 List<PropertyPath> propertyPaths
865 ) {
866 this.issues = issues;
867 this.resourceRoots = resourceRoots;
868 this.propertyPaths = propertyPaths;
869 }
870 }
871
872 }