This project has retired. For details please refer to its Attic page.
Any23Test xref
View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23;
19  
20  import org.apache.any23.extractor.ExtractorGroup;
21  import org.apache.any23.extractor.rdf.NTriplesExtractorFactory;
22  import org.apache.http.conn.ConnectTimeoutException;
23  import org.junit.Assert;
24  import org.apache.any23.configuration.Configuration;
25  import org.apache.any23.configuration.DefaultConfiguration;
26  import org.apache.any23.configuration.ModifiableConfiguration;
27  import org.apache.any23.extractor.ExtractionException;
28  import org.apache.any23.extractor.ExtractionParameters;
29  import org.apache.any23.extractor.Extractor;
30  import org.apache.any23.extractor.microdata.MicrodataExtractor;
31  import org.apache.any23.filter.IgnoreAccidentalRDFa;
32  import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments;
33  import org.apache.any23.http.DefaultHTTPClient;
34  import org.apache.any23.http.DefaultHTTPClientConfiguration;
35  import org.apache.any23.http.HTTPClient;
36  import org.apache.any23.http.HTTPClientConfiguration;
37  import org.apache.any23.source.DocumentSource;
38  import org.apache.any23.source.HTTPDocumentSource;
39  import org.apache.any23.source.StringDocumentSource;
40  import org.apache.any23.util.FileUtils;
41  import org.apache.any23.util.StreamUtils;
42  import org.apache.any23.util.StringUtils;
43  import org.apache.any23.vocab.DCTerms;
44  import org.apache.any23.writer.CompositeTripleHandler;
45  import org.apache.any23.writer.CountingTripleHandler;
46  import org.apache.any23.writer.NTriplesWriter;
47  import org.apache.any23.writer.RDFXMLWriter;
48  import org.apache.any23.writer.ReportingTripleHandler;
49  import org.apache.any23.writer.RepositoryWriter;
50  import org.apache.any23.writer.TripleHandler;
51  import org.apache.any23.writer.TripleHandlerException;
52  import org.apache.commons.io.IOUtils;
53  import org.junit.AssumptionViolatedException;
54  import org.junit.Test;
55  import org.eclipse.rdf4j.model.Statement;
56  import org.eclipse.rdf4j.repository.Repository;
57  import org.eclipse.rdf4j.repository.RepositoryConnection;
58  import org.eclipse.rdf4j.repository.RepositoryResult;
59  import org.eclipse.rdf4j.repository.sail.SailRepository;
60  import org.eclipse.rdf4j.rio.RDFParseException;
61  import org.eclipse.rdf4j.sail.memory.MemoryStore;
62  import org.slf4j.Logger;
63  import org.slf4j.LoggerFactory;
64  
65  import java.io.ByteArrayOutputStream;
66  import java.io.IOException;
67  import java.net.URISyntaxException;
68  import java.nio.charset.StandardCharsets;
69  import java.util.Collections;
70  import java.util.List;
71  import java.util.Locale;
72  
73  import static org.apache.any23.extractor.ExtractionParameters.ValidationMode;
74  
75  /**
76   * Test case for {@link Any23} facade.
77   * 
78   * @author Davide Palmisano ( dpalmisano@gmail.com )
79   * @author Michele Mostarda ( michele.mostarda@gmail.com )
80   */
81  @SuppressWarnings("unchecked")
82  public class Any23Test extends Any23OnlineTestBase {
83  
84      private static final DCTerms vDCTERMS = DCTerms.getInstance();
85  
86      private static final String PAGE_URL = "http://bob.com";
87  
88      private static final Logger logger = LoggerFactory.getLogger(Any23Test.class);
89  
90      @Test
91      public void testTTLDetection() throws Exception {
92          assertDetection("<a> <b> <c> .", "rdf-turtle");
93      }
94  
95      @Test
96      public void testN3Detection1() throws Exception {
97          assertDetection("<Bob><brothers>(<Jim><Mark>).", "rdf-turtle");
98      }
99  
100     @Test
101     public void testN3Detection2() throws Exception {
102         assertDetection("<http://example.org/path> <http://foo.com> <http://example.org/Document/foo#> .", "rdf-nt");
103     }
104 
105     @Test
106     public void testHTMLBruteForceDetection() throws Exception {
107         assertDetection("<html><body><div class=\"vcard fn\">Joe</div></body></html>");
108     }
109 
110     /**
111      * This tests the behavior of <i>Any23</i> to execute the extraction explicitly specifying the charset encoding of
112      * the input.
113      * 
114      * @throws Exception
115      *             if there is an error reading the input
116      */
117     @Test
118     public void testExplicitEncoding() throws Exception {
119         assertEncodingDetection("UTF-8", "/html/encoding-test.html", "Knud M\u00F6ller");
120     }
121 
122     /**
123      * This tests the behavior of <i>Any23</i> to perform the extraction without passing it any charset encoding. The
124      * encoding is therefore guessed using {@link org.apache.any23.encoding.TikaEncodingDetector} class.
125      * 
126      * @throws Exception
127      *             if there is an error reading the input
128      */
129     @Test
130     public void testImplicitEncoding() throws Exception {
131         assertEncodingDetection(null, // The encoding will be auto detected.
132                 "/html/encoding-test.html", "Knud M\u00F6ller");
133     }
134 
135     @Test
136     public void testRDFXMLDetectionAndExtraction() throws Exception {
137         String rdfXML = "<?xml version='1.0'?> " + "<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#' "
138                 + "xmlns:dc='http://purl.org/dc/elements/1.1/'>"
139                 + "<rdf:Description rdf:about='http://www.example.com'>" + "<dc:title>x</dc:title>"
140                 + "</rdf:Description>" + "</rdf:RDF>";
141         assertDetectionAndExtraction(rdfXML);
142     }
143 
144     @Test
145     public void testNTriplesDetectionAndExtraction() throws Exception {
146         String n3 = "<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"n3 . appo\" .";
147         assertDetectionAndExtraction(n3);
148     }
149 
150     @Test
151     public void testNturtleDetectionAndExtraction() throws Exception {
152         String nTurtle = "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n"
153                 + "@prefix dc: <http://purl.org/dc/elements/1.1/> .\n"
154                 + "@prefix ex: <http://example.org/stuff/1.0/> .\n" + "\n"
155                 + "<http://www.w3.org/TR/rdf-syntax-grammar>\n"
156                 + "  dc:title \"RDF/XML Syntax Specification (Revised)\" ;\n" + "  ex:editor [\n"
157                 + "    ex:fullname \"Dave Beckett\";\n" + "    ex:homePage <http://purl.org/net/dajobe/>\n" + "  ] .";
158         assertDetectionAndExtraction(nTurtle);
159     }
160 
161     /**
162      * Tests out the first code snipped used in <i>Developer Manual</i>.
163      * 
164      * @throws Exception
165      *             if there is an error reading the input
166      */
167     @Test
168     public void testDemoCodeSnippet1() throws Exception {
169         /* 1 */Any23 runner = new Any23();
170         /* 2 */final String content = "@prefix foo: <http://example.org/ns#> .   "
171                 + "@prefix : <http://other.example.org/ns#> ." + "foo:bar foo: : .                          "
172                 + ":bar : foo:bar .                           ";
173         // The second argument of StringDocumentSource() must be a valid IRI.
174         /* 3 */DocumentSource source = new StringDocumentSource(content, "http://host.com/service");
175         /* 4 */ByteArrayOutputStream out = new ByteArrayOutputStream();
176         /* 5 */TripleHandler handler = new NTriplesWriter(out);
177         try {
178             /* 6 */runner.extract(source, handler);
179         } finally {
180             /* 7 */handler.close();
181         }
182         /* 8 */String nt = out.toString("UTF-8");
183 
184         /*
185          * <http://example.org/ns#bar> <http://example.org/ns#> <http://other.example.org/ns#> .
186          * <http://other.example.org/ns#bar> <http://other.example.org/ns#> <http://example.org/ns#bar> .
187          */
188         logger.debug("nt: " + nt);
189         Assert.assertTrue(nt.length() > 0);
190     }
191 
192     /**
193      * Tests out the second code snipped used in <i>Developer Manual</i>.
194      * 
195      * @throws Exception
196      *             if there is an error reading the input
197      */
198     @Test
199     public void testDemoCodeSnippet2() throws Exception {
200         assumeOnlineAllowed();
201 
202         Any23 runner = new Any23();
203         runner.setHTTPUserAgent("apache-any23-test-user-agent");
204         HTTPClient httpClient = runner.getHTTPClient();
205         DocumentSource source = new HTTPDocumentSource(httpClient, "http://dbpedia.org/resource/Trento");
206         ByteArrayOutputStream out = new ByteArrayOutputStream();
207         TripleHandler handler = new NTriplesWriter(out);
208         try {
209             runner.extract(source, handler);
210         } finally {
211             handler.close();
212         }
213         String n3 = out.toString("UTF-8");
214 
215         /*
216          * <http://dbpedia.org/resource/Trent> <http://dbpedia.org/ontology/wikiPageDisambiguates>
217          * <http://dbpedia.org/resource/Trento> . <http://dbpedia.org/resource/Andrea_Pozzo>
218          * <http://dbpedia.org/ontology/birthPlace> <http://dbpedia.org/resource/Trento> .
219          * <http://dbpedia.org/resource/Union_for_Trentino> <http://dbpedia.org/ontology/headquarter>
220          * <http://dbpedia.org/resource/Trento> . [...]
221          */
222         logger.debug("n3: " + n3);
223         Assert.assertTrue(n3.length() > 0);
224 
225         Assert.assertTrue(n3.contains(
226                 "<http://dbpedia.org/resource/Trento> <http://dbpedia.org/property/mayor> \"Franco Ianeselli, elected 2020\"@en ."));
227     }
228 
229     /**
230      * This test checks the extraction behavior when the library is used programatically. This test is related to the
231      * issue #45, to verify the different behaviors between Maven and Ant. The behavior was related to a 2nd-level
232      * dependency introduced by Maven.
233      * 
234      * @throws org.apache.any23.extractor.ExtractionException
235      *             if there is an error running extraction logic
236      * @throws IOException
237      *             if there is an error reading the input
238      * @throws URISyntaxException
239      *             if there is an error defining input URI's
240      */
241     @Test
242     public void testProgrammaticExtraction() throws ExtractionException, IOException, URISyntaxException {
243         Any23 any23 = new Any23();
244         any23.setHTTPUserAgent("Any23-Servlet");
245         any23.setHTTPClient(new DefaultHTTPClient() {
246             @Override
247             protected int getConnectionTimeout() {
248                 return 5000;
249             }
250 
251             @Override
252             protected int getSoTimeout() {
253                 return 2000;
254             }
255         });
256         ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
257         TripleHandler handler = new NTriplesWriter(byteArrayOutputStream);
258         TripleHandler rdfWriter = new IgnoreAccidentalRDFa(handler);
259         ReportingTripleHandler reporting = new ReportingTripleHandler(rdfWriter);
260 
261         DocumentSource source = getDocumentSourceFromResource("/html/rdfa/ansa_2010-02-26_12645863.html",
262                 "http://host.com/service");
263 
264         Assert.assertTrue(any23.extract(source, reporting).hasMatchingExtractors());
265         try {
266             handler.close();
267         } catch (TripleHandlerException e) {
268             Assert.fail(e.getMessage());
269         }
270 
271         final String bufferContent = byteArrayOutputStream.toString(StandardCharsets.UTF_8);
272         logger.debug(bufferContent);
273         Assert.assertSame("Unexpected number of triples.", 18, StringUtils.countNL(bufferContent));
274 
275     }
276 
277     /**
278      * This test checks if a URL that is supposed to be GZIPPED is correctly opened and parsed with the {@link Any23}
279      * facade.
280      * 
281      * @throws org.apache.any23.extractor.ExtractionException
282      *             if there is an error running extraction logic
283      * @throws IOException
284      *             if there is an error reading the input
285      * @throws URISyntaxException
286      *             if there is an error defining input URI's
287      */
288     @Test
289     public void testGZippedContent() throws IOException, URISyntaxException, ExtractionException {
290         assumeOnlineAllowed();
291         final Any23 runner = new Any23();
292         runner.setHTTPUserAgent("apache-any23-test-user-agent");
293         DocumentSource source = new HTTPDocumentSource(runner.getHTTPClient(), "https://dev.w3.org/html5/rdfa/");
294         ByteArrayOutputStream out = new ByteArrayOutputStream();
295         TripleHandler handler = new NTriplesWriter(out);
296         try {
297             runner.extract(source, handler);
298         } catch (ConnectTimeoutException e) {
299             // This page is down as of 2019.09.14
300             logger.error("Connection to " + source.getDocumentIRI() + " timed out; skipping test", e);
301             throw new AssumptionViolatedException(e.getMessage());
302         }
303         String n3 = out.toString(StandardCharsets.UTF_8);
304         logger.debug("N3 " + n3);
305         Assert.assertTrue(n3.length() > 0);
306     }
307 
308     @Test
309     public void testExtractionParameters() throws IOException, ExtractionException, TripleHandlerException {
310         // not quite sure if following triples should be extracted
311         // ?doc <http://www.w3.org/1999/xhtml/vocab#icon> <https://any23.googlecode.com/favicon.ico> .
312         // ?doc <http://www.w3.org/1999/xhtml/vocab#stylesheet> <https://any23.googlecode.com/design/style.css> .
313 
314         final int EXPECTED_TRIPLES = 12;
315         Any23 runner = new Any23();
316         DocumentSource source = getDocumentSourceFromResource("/org/apache/any23/validator/missing-og-namespace.html",
317                 "http://www.test.com");
318 
319         ByteArrayOutputStream baos = new ByteArrayOutputStream();
320 
321         CountingTripleHandler cth1 = new CountingTripleHandler();
322         NTriplesWriter ctw1 = new NTriplesWriter(baos);
323         CompositeTripleHandler compositeTH1 = new CompositeTripleHandler();
324         compositeTH1.addChild(cth1);
325         compositeTH1.addChild(ctw1);
326         try {
327             runner.extract(new ExtractionParameters(DefaultConfiguration.singleton(), ValidationMode.NONE), source,
328                     compositeTH1);
329         } finally {
330             compositeTH1.close();
331         }
332         logger.debug(baos.toString(StandardCharsets.UTF_8));
333         Assert.assertEquals("Unexpected number of triples.", EXPECTED_TRIPLES, cth1.getCount());
334     }
335 
336     @Test
337     public void testExtractionParametersWithNestingDisabled()
338             throws IOException, ExtractionException, TripleHandlerException {
339         final int EXPECTED_TRIPLES = 20;
340         Any23 runner = new Any23();
341         DocumentSource source = getDocumentSourceFromResource("/microformats/nested-microformats-a1.html",
342                 "http://www.test.com");
343 
344         ByteArrayOutputStream baos = new ByteArrayOutputStream();
345 
346         CountingTripleHandler cth1 = new CountingTripleHandler();
347         RDFXMLWriter ctw1 = new RDFXMLWriter(baos);
348         CompositeTripleHandler compositeTH1 = new CompositeTripleHandler();
349         compositeTH1.addChild(cth1);
350         compositeTH1.addChild(ctw1);
351         runner.extract(new ExtractionParameters(DefaultConfiguration.singleton(), ValidationMode.NONE, true), source,
352                 compositeTH1);
353         compositeTH1.close();
354         logger.debug("Out1: " + baos.toString(StandardCharsets.UTF_8));
355         Assert.assertEquals("Unexpected number of triples.", EXPECTED_TRIPLES + 3, cth1.getCount());
356 
357         baos.reset();
358         CountingTripleHandler cth2 = new CountingTripleHandler();
359         NTriplesWriter ctw2 = new NTriplesWriter(baos);
360         CompositeTripleHandler compositeTH2 = new CompositeTripleHandler();
361         compositeTH2.addChild(cth2);
362         compositeTH2.addChild(ctw2);
363         runner.extract(
364                 new ExtractionParameters(DefaultConfiguration.singleton(), ValidationMode.VALIDATE_AND_FIX, false),
365                 source, compositeTH2);
366         compositeTH2.close();
367         logger.debug("Out2: " + baos.toString(StandardCharsets.UTF_8));
368         Assert.assertEquals("Unexpected number of triples.", EXPECTED_TRIPLES, cth2.getCount());
369     }
370 
371     @Test
372     public void testExceptionPropagation() throws IOException {
373         Any23 any23 = new Any23();
374         DocumentSource source = getDocumentSourceFromResource("/application/turtle/geolinkeddata.ttl",
375                 "http://www.test.com");
376         CountingTripleHandler cth1 = new CountingTripleHandler();
377         try {
378             any23.extract(source, cth1);
379         } catch (ExtractionException e) {
380             Assert.assertTrue(e.getCause() instanceof RDFParseException);
381         }
382 
383     }
384 
385     /**
386      * Test correct management of general <i>XML</i> content.
387      * 
388      * @throws org.apache.any23.extractor.ExtractionException
389      *             if there is an error running extraction logic
390      * @throws IOException
391      *             if there is an error reading the input
392      */
393     @Test
394     public void testXMLMimeTypeManagement() throws IOException, ExtractionException {
395         final String documentIRI = "http://www.test.com/resource.xml";
396         final String contentType = "application/xml";
397         final String in = StreamUtils.asString(this.getClass().getResourceAsStream("any23-xml-mimetype.xml"));
398         final DocumentSource doc = new StringDocumentSource(in, documentIRI, contentType);
399         final Any23 any23 = new Any23();
400         final CountingTripleHandler cth = new CountingTripleHandler(false);
401         final ReportingTripleHandler rth = new ReportingTripleHandler(cth);
402         final ExtractionReport report = any23.extract(doc, rth);
403         Assert.assertFalse(report.hasMatchingExtractors());
404         Assert.assertEquals(0, cth.getCount());
405     }
406 
407     /**
408      * Test correct management of general <i>XML</i> content from <i>URL</i> source.
409      * 
410      * @throws org.apache.any23.extractor.ExtractionException
411      *             if there is an error running extraction logic
412      * @throws IOException
413      *             if there is an error reading the input
414      */
415     @Test
416     public void testXMLMimeTypeManagementViaURL() throws IOException, ExtractionException {
417         assumeOnlineAllowed();
418         final Any23 any23 = new Any23();
419         any23.setHTTPUserAgent("apache-any23-test-user-agent");
420         HTTPClient client = any23.getHTTPClient();
421         HTTPClientConfiguration configuration = new DefaultHTTPClientConfiguration("application/xml");
422         client.init(configuration);
423         final CountingTripleHandler cth = new CountingTripleHandler(false);
424         final ReportingTripleHandler rth = new ReportingTripleHandler(cth);
425         final ExtractionReport report = any23.extract("http://www.legislation.gov.uk/ukpga/2015/17/section/4/data.xml",
426                 rth);
427         Assert.assertFalse(report.hasMatchingExtractors());
428         Assert.assertEquals(0, cth.getCount());
429     }
430 
431     @Test
432     public void testBlankNodesViaURL() throws IOException, ExtractionException {
433         assumeOnlineAllowed();
434         final Any23 any23 = new Any23();
435         any23.setHTTPUserAgent("apache-any23-test-user-agent");
436         final CountingTripleHandler cth = new CountingTripleHandler(false);
437         final ReportingTripleHandler rth = new ReportingTripleHandler(cth);
438         final ExtractionReport report = any23.extract("https://www.w3.org/", rth);
439         Assert.assertTrue(report.hasMatchingExtractors());
440     }
441 
442     @Test
443     public void testMicrodataSupport() throws Exception {
444         final String htmlWithMicrodata = IOUtils
445                 .toString(getClass().getResourceAsStream("/microdata/microdata-basic.html"), StandardCharsets.UTF_8);
446         assertExtractorActivation(htmlWithMicrodata, MicrodataExtractor.class);
447     }
448 
449     @Test
450     public void testAbstractMethodErrorIssue186_1() throws IOException, ExtractionException {
451         final Any23 runner = new Any23();
452         final String content = FileUtils.readResourceContent("/html/rdfa/rdfa-issue186-1.xhtml");
453         final DocumentSource source = new StringDocumentSource(content, "http://base.com");
454         final ByteArrayOutputStream out = new ByteArrayOutputStream();
455         final TripleHandler handler = new NTriplesWriter(out);
456         runner.extract(source, handler);
457         String n3 = out.toString("UTF-8");
458         logger.debug(n3);
459     }
460 
461     @Test
462     public void testAbstractMethodErrorIssue186_2() throws IOException, ExtractionException {
463         final Any23 runner = new Any23();
464         final String content = FileUtils.readResourceContent("/html/rdfa/rdfa-issue186-2.xhtml");
465         final DocumentSource source = new StringDocumentSource(content, "http://richard.cyganiak.de/");
466         final ByteArrayOutputStream out = new ByteArrayOutputStream();
467         final TripleHandler handler = new NTriplesWriter(out);
468         runner.extract(source, handler);
469         final String n3 = out.toString("UTF-8");
470         logger.debug(n3);
471     }
472 
473     @Test
474     public void testModifiableConfiguration_issue183() throws Exception {
475         final ModifiableConfiguration modifiableConf = DefaultConfiguration.copy();
476         modifiableConf.setProperty("any23.extraction.metadata.timesize", "off");
477         final Any23 any23 = new Any23(modifiableConf);
478 
479         final String content = FileUtils.readResourceContent("/rdf/rdf-issue183.ttl");
480         final DocumentSource source = new StringDocumentSource(content, "http://base.com");
481         final ByteArrayOutputStream out = new ByteArrayOutputStream();
482         final TripleHandler handler = new NTriplesWriter(out);
483         any23.extract(source, handler);
484         handler.close();
485         final String n3 = out.toString("UTF-8");
486 
487         logger.debug(n3);
488         Assert.assertFalse("Should not contain triple with http://vocab.sindice.net/date",
489                 n3.contains("http://vocab.sindice.net/date"));
490         Assert.assertFalse("Should not contain triple with http://vocab.sindice.net/size",
491                 n3.contains("http://vocab.sindice.net/size"));
492     }
493 
494     @Test
495     public void testIssue415InvalidNTriples() throws Exception {
496         NTriplesExtractorFactory factory = new NTriplesExtractorFactory();
497         Any23 runner = new Any23(new ExtractorGroup(Collections.singleton(factory)));
498 
499         ExtractionReport report = runner.extract(IOUtils.resourceToString("/rdf/issue415.txt", StandardCharsets.UTF_8),
500                 "http://humanstxt.org/humans.txt", new CompositeTripleHandler());
501         Assert.assertEquals("text/plain", report.getDetectedMimeType());
502         Assert.assertEquals(0, report.getExtractorIssues(factory.getExtractorName()).size());
503         Assert.assertEquals(0, report.getMatchingExtractors().size());
504     }
505 
506     @Test
507     public void testIssue415ValidNTriples() throws Exception {
508         NTriplesExtractorFactory factory = new NTriplesExtractorFactory();
509         Any23 runner = new Any23(new ExtractorGroup(Collections.singleton(factory)));
510 
511         CountingTripleHandler handler = new CountingTripleHandler();
512         ExtractionReport report = runner.extract(
513                 IOUtils.resourceToString("/rdf/issue415-valid.txt", StandardCharsets.UTF_8),
514                 "http://humanstxt.org/humans.txt", handler);
515         Assert.assertEquals("application/n-triples", report.getDetectedMimeType());
516         Assert.assertEquals(0, report.getExtractorIssues(factory.getExtractorName()).size());
517         Assert.assertEquals(1, report.getMatchingExtractors().size());
518         Assert.assertEquals(1, handler.getCount());
519     }
520 
521     /**
522      * Performs detection and extraction on the given input string and return the {@link ExtractionReport}.
523      * 
524      * @param in
525      *            input string.
526      * 
527      * @return a populated {@link org.apache.any23.ExtractionReport}
528      * 
529      * @throws Exception
530      *             if there is an error detecting mime type and running extraction
531      */
532     private ExtractionReport detectAndExtract(String in) throws Exception {
533         Any23 any23 = new Any23();
534         Configuration conf = DefaultConfiguration.copy();
535         ByteArrayOutputStream out = new ByteArrayOutputStream();
536         ReportingTripleHandler outputHandler = new ReportingTripleHandler(
537                 new IgnoreAccidentalRDFa(new IgnoreTitlesOfEmptyDocuments(new NTriplesWriter(out))));
538         return any23.extract(new ExtractionParameters(conf, ValidationMode.VALIDATE_AND_FIX, null, null),
539                 new StringDocumentSource(in, "http://host.com/path"), outputHandler, "UTF-8");
540     }
541 
542     /**
543      * Asserts that a list an {@link Extractor} has been activated for the given input data.
544      * 
545      * @param in
546      *            input data as string.
547      * 
548      * @throws IOException
549      * @throws ExtractionException
550      */
551     private void assertDetectionAndExtraction(String in) throws Exception {
552         final ExtractionReport extractionReport = detectAndExtract(in);
553         Assert.assertTrue("Detection and extraction failed, no matching extractors.",
554                 extractionReport.hasMatchingExtractors());
555     }
556 
557     /**
558      * Assert the correct activation of the given list of {@link Extractor}s for the given input string.
559      * 
560      * @param in
561      *            input data as string.
562      * @param expectedExtractors
563      * 
564      * @throws IOException
565      * @throws ExtractionException
566      */
567     private void assertExtractorActivation(String in,
568             @SuppressWarnings("rawtypes") Class<? extends Extractor>... expectedExtractors) throws Exception {
569         final ExtractionReport extractionReport = detectAndExtract(in);
570         for (@SuppressWarnings("rawtypes")
571         Class<? extends Extractor> expectedExtractorClass : expectedExtractors) {
572             Assert.assertTrue(
573                     String.format(Locale.ROOT, "Detection and extraction failed, expected extractor [%s] not found.",
574                             expectedExtractorClass),
575                     containsClass(extractionReport.getMatchingExtractors(), expectedExtractorClass));
576         }
577     }
578 
579     /**
580      * Asserts the correct encoding detection for a specified data.
581      * 
582      * @param encoding
583      *            the expected specified encoding, if <code>null</code> will be auto detected.
584      * @param input
585      * @param expectedContent
586      * 
587      * @throws Exception
588      */
589     private void assertEncodingDetection(String encoding, String input, String expectedContent) throws Exception {
590         DocumentSource fileDocumentSource = getDocumentSourceFromResource(input);
591         Any23 any23;
592         RepositoryConnection conn = null;
593         RepositoryWriter repositoryWriter = null;
594 
595         any23 = new Any23();
596         Repository store = new SailRepository(new MemoryStore());
597         store.init();
598         try {
599             conn = store.getConnection();
600             repositoryWriter = new RepositoryWriter(conn);
601             Assert.assertTrue(any23.extract(fileDocumentSource, repositoryWriter, encoding).hasMatchingExtractors());
602 
603             RepositoryResult<Statement> statements = conn.getStatements(null, vDCTERMS.title, null, false);
604             try {
605                 while (statements.hasNext()) {
606                     Statement statement = statements.next();
607                     printStatement(statement);
608                     Assert.assertTrue(statement.getObject().stringValue().contains(expectedContent));
609                 }
610             } finally {
611                 statements.close();
612             }
613         } finally {
614             if (conn != null) {
615                 conn.close();
616             }
617             if (repositoryWriter != null) {
618                 repositoryWriter.close();
619             }
620         }
621         fileDocumentSource = null;
622         any23 = null;
623     }
624 
625     /**
626      * Will try to detect the <i>content</i> trying sequentially with all specified parser.
627      * 
628      * @param content
629      * @param parsers
630      * 
631      * @throws Exception
632      */
633     private void assertDetection(String content, String... parsers) throws Exception {
634         ByteArrayOutputStream out = new ByteArrayOutputStream();
635         Any23 runner = new Any23(parsers.length == 0 ? null : parsers);
636         if (parsers.length != 0) {
637             runner.setMIMETypeDetector(null); // Use all the provided
638                                               // extractors.
639         }
640         final NTriplesWriter tripleHandler = new NTriplesWriter(out);
641         runner.extract(new StringDocumentSource(content, PAGE_URL), tripleHandler);
642         tripleHandler.close();
643         String result = out.toString("us-ascii");
644         Assert.assertNotNull(result);
645         Assert.assertTrue(result.length() > 10);
646     }
647 
648     private void printStatement(Statement statement) {
649         logger.debug(String.format(Locale.ROOT, "%s\t%s\t%s", statement.getSubject(), statement.getPredicate(),
650                 statement.getObject()));
651     }
652 
653     private boolean containsClass(List<?> list, Class<?> clazz) {
654         for (Object o : list) {
655             if (o.getClass().equals(clazz)) {
656                 return true;
657             }
658         }
659         return false;
660     }
661 
662 }