Any23Test

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *  http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.any23;

import org.apache.any23.extractor.ExtractorGroup;
import org.apache.any23.extractor.rdf.NTriplesExtractorFactory;
import org.apache.http.conn.ConnectTimeoutException;
import org.junit.Assert;
import org.apache.any23.configuration.Configuration;
import org.apache.any23.configuration.DefaultConfiguration;
import org.apache.any23.configuration.ModifiableConfiguration;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.extractor.ExtractionParameters;
import org.apache.any23.extractor.Extractor;
import org.apache.any23.extractor.microdata.MicrodataExtractor;
import org.apache.any23.filter.IgnoreAccidentalRDFa;
import org.apache.any23.filter.IgnoreTitlesOfEmptyDocuments;
import org.apache.any23.http.DefaultHTTPClient;
import org.apache.any23.http.DefaultHTTPClientConfiguration;
import org.apache.any23.http.HTTPClient;
import org.apache.any23.http.HTTPClientConfiguration;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.source.HTTPDocumentSource;
import org.apache.any23.source.StringDocumentSource;
import org.apache.any23.util.FileUtils;
import org.apache.any23.util.StreamUtils;
import org.apache.any23.util.StringUtils;
import org.apache.any23.vocab.DCTerms;
import org.apache.any23.writer.CompositeTripleHandler;
import org.apache.any23.writer.CountingTripleHandler;
import org.apache.any23.writer.NTriplesWriter;
import org.apache.any23.writer.RDFXMLWriter;
import org.apache.any23.writer.ReportingTripleHandler;
import org.apache.any23.writer.RepositoryWriter;
import org.apache.any23.writer.TripleHandler;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.commons.io.IOUtils;
import org.junit.AssumptionViolatedException;
import org.junit.Test;
import org.eclipse.rdf4j.model.Statement;
import org.eclipse.rdf4j.repository.Repository;
import org.eclipse.rdf4j.repository.RepositoryConnection;
import org.eclipse.rdf4j.repository.RepositoryResult;
import org.eclipse.rdf4j.repository.sail.SailRepository;
import org.eclipse.rdf4j.rio.RDFParseException;
import org.eclipse.rdf4j.sail.memory.MemoryStore;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.net.URISyntaxException;
import java.nio.charset.StandardCharsets;
import java.util.Collections;
import java.util.List;
import java.util.Locale;

import static org.apache.any23.extractor.ExtractionParameters.ValidationMode;

/**
 * Test case for {@link Any23} facade.
 *
 * @author Davide Palmisano ( dpalmisano@gmail.com )
 * @author Michele Mostarda ( michele.mostarda@gmail.com )
 */
@SuppressWarnings("unchecked")
public class Any23Test extends Any23OnlineTestBase {

    private static final DCTerms vDCTERMS = DCTerms.getInstance();

    private static final String PAGE_URL = "http://bob.com";

    private static final Logger logger = LoggerFactory.getLogger(Any23Test.class);

    @Test
    public void testTTLDetection() throws Exception {
        assertDetection("<a> <b> <c> .", "rdf-turtle");
    }

    @Test
    public void testN3Detection1() throws Exception {
        assertDetection("<Bob><brothers>(<Jim><Mark>).", "rdf-turtle");
    }

    @Test
    public void testN3Detection2() throws Exception {
        assertDetection("<http://example.org/path> <http://foo.com> <http://example.org/Document/foo#> .", "rdf-nt");
    }

    @Test
    public void testHTMLBruteForceDetection() throws Exception {
        assertDetection("<html><body><div class=\"vcard fn\">Joe</div></body></html>");
    }

    /**
     * This tests the behavior of <i>Any23</i> to execute the extraction explicitly specifying the charset encoding of
     * the input.
     *
     * @throws Exception
     *             if there is an error reading the input
     */
    @Test
    public void testExplicitEncoding() throws Exception {
        assertEncodingDetection("UTF-8", "/html/encoding-test.html", "Knud M\u00F6ller");
    }

    /**
     * This tests the behavior of <i>Any23</i> to perform the extraction without passing it any charset encoding. The
     * encoding is therefore guessed using {@link org.apache.any23.encoding.TikaEncodingDetector} class.
     *
     * @throws Exception
     *             if there is an error reading the input
     */
    @Test
    public void testImplicitEncoding() throws Exception {
        assertEncodingDetection(null, // The encoding will be auto detected.
                "/html/encoding-test.html", "Knud M\u00F6ller");
    }

    @Test
    public void testRDFXMLDetectionAndExtraction() throws Exception {
        String rdfXML = "<?xml version='1.0'?> " + "<rdf:RDF xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#' "
                + "xmlns:dc='http://purl.org/dc/elements/1.1/'>"
                + "<rdf:Description rdf:about='http://www.example.com'>" + "<dc:title>x</dc:title>"
                + "</rdf:Description>" + "</rdf:RDF>";
        assertDetectionAndExtraction(rdfXML);
    }

    @Test
    public void testNTriplesDetectionAndExtraction() throws Exception {
        String n3 = "<http://www.example.com> <http://purl.org/dc/elements/1.1/title> \"n3 . appo\" .";
        assertDetectionAndExtraction(n3);
    }

    @Test
    public void testNturtleDetectionAndExtraction() throws Exception {
        String nTurtle = "@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .\n"
                + "@prefix dc: <http://purl.org/dc/elements/1.1/> .\n"
                + "@prefix ex: <http://example.org/stuff/1.0/> .\n" + "\n"
                + "<http://www.w3.org/TR/rdf-syntax-grammar>\n"
                + "  dc:title \"RDF/XML Syntax Specification (Revised)\" ;\n" + "  ex:editor [\n"
                + "    ex:fullname \"Dave Beckett\";\n" + "    ex:homePage <http://purl.org/net/dajobe/>\n" + "  ] .";
        assertDetectionAndExtraction(nTurtle);
    }

    /**
     * Tests out the first code snipped used in <i>Developer Manual</i>.
     *
     * @throws Exception
     *             if there is an error reading the input
     */
    @Test
    public void testDemoCodeSnippet1() throws Exception {
        /* 1 */Any23 runner = new Any23();
        /* 2 */final String content = "@prefix foo: <http://example.org/ns#> .   "
                + "@prefix : <http://other.example.org/ns#> ." + "foo:bar foo: : .                          "
                + ":bar : foo:bar .                           ";
        // The second argument of StringDocumentSource() must be a valid IRI.
        /* 3 */DocumentSource source = new StringDocumentSource(content, "http://host.com/service");
        /* 4 */ByteArrayOutputStream out = new ByteArrayOutputStream();
        /* 5 */TripleHandler handler = new NTriplesWriter(out);
        try {
            /* 6 */runner.extract(source, handler);
        } finally {
            /* 7 */handler.close();
        }
        /* 8 */String nt = out.toString("UTF-8");

        /*
         * <http://example.org/ns#bar> <http://example.org/ns#> <http://other.example.org/ns#> .
         * <http://other.example.org/ns#bar> <http://other.example.org/ns#> <http://example.org/ns#bar> .
         */
        logger.debug("nt: " + nt);
        Assert.assertTrue(nt.length() > 0);
    }

    /**
     * Tests out the second code snipped used in <i>Developer Manual</i>.
     *
     * @throws Exception
     *             if there is an error reading the input
     */
    @Test
    public void testDemoCodeSnippet2() throws Exception {
        assumeOnlineAllowed();

        Any23 runner = new Any23();
        runner.setHTTPUserAgent("apache-any23-test-user-agent");
        HTTPClient httpClient = runner.getHTTPClient();
        DocumentSource source = new HTTPDocumentSource(httpClient, "http://dbpedia.org/resource/Trento");
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        TripleHandler handler = new NTriplesWriter(out);
        try {
            runner.extract(source, handler);
        } finally {
            handler.close();
        }
        String n3 = out.toString("UTF-8");

        /*
         * <http://dbpedia.org/resource/Trent> <http://dbpedia.org/ontology/wikiPageDisambiguates>
         * <http://dbpedia.org/resource/Trento> . <http://dbpedia.org/resource/Andrea_Pozzo>
         * <http://dbpedia.org/ontology/birthPlace> <http://dbpedia.org/resource/Trento> .
         * <http://dbpedia.org/resource/Union_for_Trentino> <http://dbpedia.org/ontology/headquarter>
         * <http://dbpedia.org/resource/Trento> . [...]
         */
        logger.debug("n3: " + n3);
        Assert.assertTrue(n3.length() > 0);

        Assert.assertTrue(n3.contains(
                "<http://dbpedia.org/resource/Trento> <http://dbpedia.org/property/mayor> \"Franco Ianeselli, elected 2020\"@en ."));
    }

    /**
     * This test checks the extraction behavior when the library is used programatically. This test is related to the
     * issue #45, to verify the different behaviors between Maven and Ant. The behavior was related to a 2nd-level
     * dependency introduced by Maven.
     *
     * @throws org.apache.any23.extractor.ExtractionException
     *             if there is an error running extraction logic
     * @throws IOException
     *             if there is an error reading the input
     * @throws URISyntaxException
     *             if there is an error defining input URI's
     */
    @Test
    public void testProgrammaticExtraction() throws ExtractionException, IOException, URISyntaxException {
        Any23 any23 = new Any23();
        any23.setHTTPUserAgent("Any23-Servlet");
        any23.setHTTPClient(new DefaultHTTPClient() {
            @Override
            protected int getConnectionTimeout() {
                return 5000;
            }

            @Override
            protected int getSoTimeout() {
                return 2000;
            }
        });
        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
        TripleHandler handler = new NTriplesWriter(byteArrayOutputStream);
        TripleHandler rdfWriter = new IgnoreAccidentalRDFa(handler);
        ReportingTripleHandler reporting = new ReportingTripleHandler(rdfWriter);

        DocumentSource source = getDocumentSourceFromResource("/html/rdfa/ansa_2010-02-26_12645863.html",
                "http://host.com/service");

        Assert.assertTrue(any23.extract(source, reporting).hasMatchingExtractors());
        try {
            handler.close();
        } catch (TripleHandlerException e) {
            Assert.fail(e.getMessage());
        }

        final String bufferContent = byteArrayOutputStream.toString(StandardCharsets.UTF_8);
        logger.debug(bufferContent);
        Assert.assertSame("Unexpected number of triples.", 18, StringUtils.countNL(bufferContent));

    }

    /**
     * This test checks if a URL that is supposed to be GZIPPED is correctly opened and parsed with the {@link Any23}
     * facade.
     *
     * @throws org.apache.any23.extractor.ExtractionException
     *             if there is an error running extraction logic
     * @throws IOException
     *             if there is an error reading the input
     * @throws URISyntaxException
     *             if there is an error defining input URI's
     */
    @Test
    public void testGZippedContent() throws IOException, URISyntaxException, ExtractionException {
        assumeOnlineAllowed();
        final Any23 runner = new Any23();
        runner.setHTTPUserAgent("apache-any23-test-user-agent");
        DocumentSource source = new HTTPDocumentSource(runner.getHTTPClient(), "https://dev.w3.org/html5/rdfa/");
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        TripleHandler handler = new NTriplesWriter(out);
        try {
            runner.extract(source, handler);
        } catch (ConnectTimeoutException e) {
            // This page is down as of 2019.09.14
            logger.error("Connection to " + source.getDocumentIRI() + " timed out; skipping test", e);
            throw new AssumptionViolatedException(e.getMessage());
        }
        String n3 = out.toString(StandardCharsets.UTF_8);
        logger.debug("N3 " + n3);
        Assert.assertTrue(n3.length() > 0);
    }

    @Test
    public void testExtractionParameters() throws IOException, ExtractionException, TripleHandlerException {
        // not quite sure if following triples should be extracted
        // ?doc <http://www.w3.org/1999/xhtml/vocab#icon> <https://any23.googlecode.com/favicon.ico> .
        // ?doc <http://www.w3.org/1999/xhtml/vocab#stylesheet> <https://any23.googlecode.com/design/style.css> .

        final int EXPECTED_TRIPLES = 12;
        Any23 runner = new Any23();
        DocumentSource source = getDocumentSourceFromResource("/org/apache/any23/validator/missing-og-namespace.html",
                "http://www.test.com");

        ByteArrayOutputStream baos = new ByteArrayOutputStream();

        CountingTripleHandler cth1 = new CountingTripleHandler();
        NTriplesWriter ctw1 = new NTriplesWriter(baos);
        CompositeTripleHandler compositeTH1 = new CompositeTripleHandler();
        compositeTH1.addChild(cth1);
        compositeTH1.addChild(ctw1);
        try {
            runner.extract(new ExtractionParameters(DefaultConfiguration.singleton(), ValidationMode.NONE), source,
                    compositeTH1);
        } finally {
            compositeTH1.close();
        }
        logger.debug(baos.toString(StandardCharsets.UTF_8));
        Assert.assertEquals("Unexpected number of triples.", EXPECTED_TRIPLES, cth1.getCount());
    }

    @Test
    public void testExtractionParametersWithNestingDisabled()
            throws IOException, ExtractionException, TripleHandlerException {
        final int EXPECTED_TRIPLES = 20;
        Any23 runner = new Any23();
        DocumentSource source = getDocumentSourceFromResource("/microformats/nested-microformats-a1.html",
                "http://www.test.com");

        ByteArrayOutputStream baos = new ByteArrayOutputStream();

        CountingTripleHandler cth1 = new CountingTripleHandler();
        RDFXMLWriter ctw1 = new RDFXMLWriter(baos);
        CompositeTripleHandler compositeTH1 = new CompositeTripleHandler();
        compositeTH1.addChild(cth1);
        compositeTH1.addChild(ctw1);
        runner.extract(new ExtractionParameters(DefaultConfiguration.singleton(), ValidationMode.NONE, true), source,
                compositeTH1);
        compositeTH1.close();
        logger.debug("Out1: " + baos.toString(StandardCharsets.UTF_8));
        Assert.assertEquals("Unexpected number of triples.", EXPECTED_TRIPLES + 3, cth1.getCount());

        baos.reset();
        CountingTripleHandler cth2 = new CountingTripleHandler();
        NTriplesWriter ctw2 = new NTriplesWriter(baos);
        CompositeTripleHandler compositeTH2 = new CompositeTripleHandler();
        compositeTH2.addChild(cth2);
        compositeTH2.addChild(ctw2);
        runner.extract(
                new ExtractionParameters(DefaultConfiguration.singleton(), ValidationMode.VALIDATE_AND_FIX, false),
                source, compositeTH2);
        compositeTH2.close();
        logger.debug("Out2: " + baos.toString(StandardCharsets.UTF_8));
        Assert.assertEquals("Unexpected number of triples.", EXPECTED_TRIPLES, cth2.getCount());
    }

    @Test
    public void testExceptionPropagation() throws IOException {
        Any23 any23 = new Any23();
        DocumentSource source = getDocumentSourceFromResource("/application/turtle/geolinkeddata.ttl",
                "http://www.test.com");
        CountingTripleHandler cth1 = new CountingTripleHandler();
        try {
            any23.extract(source, cth1);
        } catch (ExtractionException e) {
            Assert.assertTrue(e.getCause() instanceof RDFParseException);
        }

    }

    /**
     * Test correct management of general <i>XML</i> content.
     *
     * @throws org.apache.any23.extractor.ExtractionException
     *             if there is an error running extraction logic
     * @throws IOException
     *             if there is an error reading the input
     */
    @Test
    public void testXMLMimeTypeManagement() throws IOException, ExtractionException {
        final String documentIRI = "http://www.test.com/resource.xml";
        final String contentType = "application/xml";
        final String in = StreamUtils.asString(this.getClass().getResourceAsStream("any23-xml-mimetype.xml"));
        final DocumentSource doc = new StringDocumentSource(in, documentIRI, contentType);
        final Any23 any23 = new Any23();
        final CountingTripleHandler cth = new CountingTripleHandler(false);
        final ReportingTripleHandler rth = new ReportingTripleHandler(cth);
        final ExtractionReport report = any23.extract(doc, rth);
        Assert.assertFalse(report.hasMatchingExtractors());
        Assert.assertEquals(0, cth.getCount());
    }

    /**
     * Test correct management of general <i>XML</i> content from <i>URL</i> source.
     *
     * @throws org.apache.any23.extractor.ExtractionException
     *             if there is an error running extraction logic
     * @throws IOException
     *             if there is an error reading the input
     */
    @Test
    public void testXMLMimeTypeManagementViaURL() throws IOException, ExtractionException {
        assumeOnlineAllowed();
        final Any23 any23 = new Any23();
        any23.setHTTPUserAgent("apache-any23-test-user-agent");
        HTTPClient client = any23.getHTTPClient();
        HTTPClientConfiguration configuration = new DefaultHTTPClientConfiguration("application/xml");
        client.init(configuration);
        final CountingTripleHandler cth = new CountingTripleHandler(false);
        final ReportingTripleHandler rth = new ReportingTripleHandler(cth);
        final ExtractionReport report = any23.extract("http://www.legislation.gov.uk/ukpga/2015/17/section/4/data.xml",
                rth);
        Assert.assertFalse(report.hasMatchingExtractors());
        Assert.assertEquals(0, cth.getCount());
    }

    @Test
    public void testBlankNodesViaURL() throws IOException, ExtractionException {
        assumeOnlineAllowed();
        final Any23 any23 = new Any23();
        any23.setHTTPUserAgent("apache-any23-test-user-agent");
        final CountingTripleHandler cth = new CountingTripleHandler(false);
        final ReportingTripleHandler rth = new ReportingTripleHandler(cth);
        final ExtractionReport report = any23.extract("https://www.w3.org/", rth);
        Assert.assertTrue(report.hasMatchingExtractors());
    }

    @Test
    public void testMicrodataSupport() throws Exception {
        final String htmlWithMicrodata = IOUtils
                .toString(getClass().getResourceAsStream("/microdata/microdata-basic.html"), StandardCharsets.UTF_8);
        assertExtractorActivation(htmlWithMicrodata, MicrodataExtractor.class);
    }

    @Test
    public void testAbstractMethodErrorIssue186_1() throws IOException, ExtractionException {
        final Any23 runner = new Any23();
        final String content = FileUtils.readResourceContent("/html/rdfa/rdfa-issue186-1.xhtml");
        final DocumentSource source = new StringDocumentSource(content, "http://base.com");
        final ByteArrayOutputStream out = new ByteArrayOutputStream();
        final TripleHandler handler = new NTriplesWriter(out);
        runner.extract(source, handler);
        String n3 = out.toString("UTF-8");
        logger.debug(n3);
    }

    @Test
    public void testAbstractMethodErrorIssue186_2() throws IOException, ExtractionException {
        final Any23 runner = new Any23();
        final String content = FileUtils.readResourceContent("/html/rdfa/rdfa-issue186-2.xhtml");
        final DocumentSource source = new StringDocumentSource(content, "http://richard.cyganiak.de/");
        final ByteArrayOutputStream out = new ByteArrayOutputStream();
        final TripleHandler handler = new NTriplesWriter(out);
        runner.extract(source, handler);
        final String n3 = out.toString("UTF-8");
        logger.debug(n3);
    }

    @Test
    public void testModifiableConfiguration_issue183() throws Exception {
        final ModifiableConfiguration modifiableConf = DefaultConfiguration.copy();
        modifiableConf.setProperty("any23.extraction.metadata.timesize", "off");
        final Any23 any23 = new Any23(modifiableConf);

        final String content = FileUtils.readResourceContent("/rdf/rdf-issue183.ttl");
        final DocumentSource source = new StringDocumentSource(content, "http://base.com");
        final ByteArrayOutputStream out = new ByteArrayOutputStream();
        final TripleHandler handler = new NTriplesWriter(out);
        any23.extract(source, handler);
        handler.close();
        final String n3 = out.toString("UTF-8");

        logger.debug(n3);
        Assert.assertFalse("Should not contain triple with http://vocab.sindice.net/date",
                n3.contains("http://vocab.sindice.net/date"));
        Assert.assertFalse("Should not contain triple with http://vocab.sindice.net/size",
                n3.contains("http://vocab.sindice.net/size"));
    }

    @Test
    public void testIssue415InvalidNTriples() throws Exception {
        NTriplesExtractorFactory factory = new NTriplesExtractorFactory();
        Any23 runner = new Any23(new ExtractorGroup(Collections.singleton(factory)));

        ExtractionReport report = runner.extract(IOUtils.resourceToString("/rdf/issue415.txt", StandardCharsets.UTF_8),
                "http://humanstxt.org/humans.txt", new CompositeTripleHandler());
        Assert.assertEquals("text/plain", report.getDetectedMimeType());
        Assert.assertEquals(0, report.getExtractorIssues(factory.getExtractorName()).size());
        Assert.assertEquals(0, report.getMatchingExtractors().size());
    }

    @Test
    public void testIssue415ValidNTriples() throws Exception {
        NTriplesExtractorFactory factory = new NTriplesExtractorFactory();
        Any23 runner = new Any23(new ExtractorGroup(Collections.singleton(factory)));

        CountingTripleHandler handler = new CountingTripleHandler();
        ExtractionReport report = runner.extract(
                IOUtils.resourceToString("/rdf/issue415-valid.txt", StandardCharsets.UTF_8),
                "http://humanstxt.org/humans.txt", handler);
        Assert.assertEquals("application/n-triples", report.getDetectedMimeType());
        Assert.assertEquals(0, report.getExtractorIssues(factory.getExtractorName()).size());
        Assert.assertEquals(1, report.getMatchingExtractors().size());
        Assert.assertEquals(1, handler.getCount());
    }

    /**
     * Performs detection and extraction on the given input string and return the {@link ExtractionReport}.
     *
     * @param in
     *            input string.
     *
     * @return a populated {@link org.apache.any23.ExtractionReport}
     *
     * @throws Exception
     *             if there is an error detecting mime type and running extraction
     */
    private ExtractionReport detectAndExtract(String in) throws Exception {
        Any23 any23 = new Any23();
        Configuration conf = DefaultConfiguration.copy();
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        ReportingTripleHandler outputHandler = new ReportingTripleHandler(
                new IgnoreAccidentalRDFa(new IgnoreTitlesOfEmptyDocuments(new NTriplesWriter(out))));
        return any23.extract(new ExtractionParameters(conf, ValidationMode.VALIDATE_AND_FIX, null, null),
                new StringDocumentSource(in, "http://host.com/path"), outputHandler, "UTF-8");
    }

    /**
     * Asserts that a list an {@link Extractor} has been activated for the given input data.
     *
     * @param in
     *            input data as string.
     *
     * @throws IOException
     * @throws ExtractionException
     */
    private void assertDetectionAndExtraction(String in) throws Exception {
        final ExtractionReport extractionReport = detectAndExtract(in);
        Assert.assertTrue("Detection and extraction failed, no matching extractors.",
                extractionReport.hasMatchingExtractors());
    }

    /**
     * Assert the correct activation of the given list of {@link Extractor}s for the given input string.
     *
     * @param in
     *            input data as string.
     * @param expectedExtractors
     *
     * @throws IOException
     * @throws ExtractionException
     */
    private void assertExtractorActivation(String in,
            @SuppressWarnings("rawtypes") Class<? extends Extractor>... expectedExtractors) throws Exception {
        final ExtractionReport extractionReport = detectAndExtract(in);
        for (@SuppressWarnings("rawtypes")
        Class<? extends Extractor> expectedExtractorClass : expectedExtractors) {
            Assert.assertTrue(
                    String.format(Locale.ROOT, "Detection and extraction failed, expected extractor [%s] not found.",
                            expectedExtractorClass),
                    containsClass(extractionReport.getMatchingExtractors(), expectedExtractorClass));
        }
    }

    /**
     * Asserts the correct encoding detection for a specified data.
     *
     * @param encoding
     *            the expected specified encoding, if <code>null</code> will be auto detected.
     * @param input
     * @param expectedContent
     *
     * @throws Exception
     */
    private void assertEncodingDetection(String encoding, String input, String expectedContent) throws Exception {
        DocumentSource fileDocumentSource = getDocumentSourceFromResource(input);
        Any23 any23;
        RepositoryConnection conn = null;
        RepositoryWriter repositoryWriter = null;

        any23 = new Any23();
        Repository store = new SailRepository(new MemoryStore());
        store.init();
        try {
            conn = store.getConnection();
            repositoryWriter = new RepositoryWriter(conn);
            Assert.assertTrue(any23.extract(fileDocumentSource, repositoryWriter, encoding).hasMatchingExtractors());

            RepositoryResult<Statement> statements = conn.getStatements(null, vDCTERMS.title, null, false);
            try {
                while (statements.hasNext()) {
                    Statement statement = statements.next();
                    printStatement(statement);
                    Assert.assertTrue(statement.getObject().stringValue().contains(expectedContent));
                }
            } finally {
                statements.close();
            }
        } finally {
            if (conn != null) {
                conn.close();
            }
            if (repositoryWriter != null) {
                repositoryWriter.close();
            }
        }
        fileDocumentSource = null;
        any23 = null;
    }

    /**
     * Will try to detect the <i>content</i> trying sequentially with all specified parser.
     *
     * @param content
     * @param parsers
     *
     * @throws Exception
     */
    private void assertDetection(String content, String... parsers) throws Exception {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        Any23 runner = new Any23(parsers.length == 0 ? null : parsers);
        if (parsers.length != 0) {
            runner.setMIMETypeDetector(null); // Use all the provided
                                              // extractors.
        }
        final NTriplesWriter tripleHandler = new NTriplesWriter(out);
        runner.extract(new StringDocumentSource(content, PAGE_URL), tripleHandler);
        tripleHandler.close();
        String result = out.toString("us-ascii");
        Assert.assertNotNull(result);
        Assert.assertTrue(result.length() > 10);
    }

    private void printStatement(Statement statement) {
        logger.debug(String.format(Locale.ROOT, "%s\t%s\t%s", statement.getSubject(), statement.getPredicate(),
                statement.getObject()));
    }

    private boolean containsClass(List<?> list, Class<?> clazz) {
        for (Object o : list) {
            if (o.getClass().equals(clazz)) {
                return true;
            }
        }
        return false;
    }

}