1/*2 * Licensed to the Apache Software Foundation (ASF) under one or more3 * contributor license agreements. See the NOTICE file distributed with4 * this work for additional information regarding copyright ownership.5 * The ASF licenses this file to You under the Apache License, Version 2.06 * (the "License"); you may not use this file except in compliance with7 * the License. You may obtain a copy of the License at8 *9 * http://www.apache.org/licenses/LICENSE-2.010 *11 * Unless required by applicable law or agreed to in writing, software12 * distributed under the License is distributed on an "AS IS" BASIS,13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.14 * See the License for the specific language governing permissions and15 * limitations under the License.16 */1718package org.apache.any23.extractor;
1920import org.apache.any23.AbstractAny23TestBase;
21import org.apache.any23.configuration.DefaultConfiguration;
22import org.apache.any23.configuration.ModifiableConfiguration;
23import org.apache.any23.extractor.html.HTMLFixture;
24import org.apache.any23.mime.TikaMIMETypeDetector;
25import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
26import org.apache.any23.vocab.ICAL;
27import org.apache.any23.vocab.Review;
28import org.apache.any23.vocab.SINDICE;
29import org.apache.any23.vocab.VCard;
30import org.apache.any23.writer.CompositeTripleHandler;
31import org.apache.any23.writer.RDFXMLWriter;
32import org.apache.any23.writer.RepositoryWriter;
33import org.apache.any23.writer.TripleHandlerException;
34import org.junit.After;
35import org.junit.Assert;
36import org.junit.Before;
37import org.junit.Test;
38import org.eclipse.rdf4j.model.Resource;
39import org.eclipse.rdf4j.model.Statement;
40import org.eclipse.rdf4j.model.IRI;
41import org.eclipse.rdf4j.model.Value;
42import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
43import org.eclipse.rdf4j.repository.RepositoryConnection;
44import org.eclipse.rdf4j.repository.RepositoryException;
45import org.eclipse.rdf4j.repository.RepositoryResult;
46import org.eclipse.rdf4j.repository.sail.SailRepository;
47import org.eclipse.rdf4j.sail.Sail;
48import org.eclipse.rdf4j.sail.SailException;
49import org.eclipse.rdf4j.sail.memory.MemoryStore;
50import org.slf4j.Logger;
51import org.slf4j.LoggerFactory;
5253import java.io.ByteArrayOutputStream;
54import java.io.FileNotFoundException;
55import java.io.IOException;
56import java.nio.charset.StandardCharsets;
57import java.util.Locale;
5859/**60 * Test case for {@link SingleDocumentExtraction}.61 *62 * @author Michele Mostarda (mostarda@fbk.eu)63 * @author Davide Palmisano (palmisano@fbk.eu)64 */65// TODO #20 - Solve issue that hreview item and vcard item have the same BNode due they have the same XPath DOM.66publicclassSingleDocumentExtractionTestextendsAbstractAny23TestBase {
6768privatestaticfinal SINDICE vSINDICE = SINDICE.getInstance();
69privatestaticfinal ICAL vICAL = ICAL.getInstance();
70privatestaticfinal Review vREVIEW = Review.getInstance();
71privatestaticfinal VCard vVCARD = VCard.getInstance();
7273privatestaticfinal Logger logger = LoggerFactory.getLogger(SingleDocumentExtractionTest.class);
7475private SingleDocumentExtraction singleDocumentExtraction;
7677private ExtractorGroup extractorGroup;
7879private Sail store;
8081private RepositoryConnection conn;
8283 RepositoryWriter repositoryWriter;
8485 ByteArrayOutputStream baos;
8687 RDFXMLWriter rdfxmlWriter;
8889 @Before
90publicvoid setUp() throws Exception {
91super.setUp();
92 extractorGroup = ExtractorRegistryImpl.getInstance().getExtractorGroup();
93 store = new MemoryStore();
94 store.init();
95 conn = new SailRepository(store).getConnection();
96 }
9798 @After
99publicvoid tearDown() throws SailException, RepositoryException, TripleHandlerException {
100 rdfxmlWriter.close();
101 repositoryWriter.close();
102 logger.debug(baos.toString(StandardCharsets.UTF_8));
103104 singleDocumentExtraction = null;
105 extractorGroup = null;
106 conn.close();
107 conn = null;
108 store.shutDown();
109 store = null;
110 }
111112/**113 * Tests the existence of the domain triples.114 *115 * @throws IOException116 * if there is an error loading input data117 * @throws ExtractionException118 * if an exception is raised during extraction119 * @throws RepositoryException120 * if an error is encountered whilst loading content from a storage connection121 */122 @Test
123publicvoid testMicroformatDomains() throws IOException, ExtractionException, RepositoryException {
124 singleDocumentExtraction = getInstance("/microformats/microformat-domains.html");
125 singleDocumentExtraction.run();
126 logStorageContent();
127 assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 1);
128 }
129130/**131 * Tests the nested microformat relationships. This test verifies the first supported approach for microformat132 * nesting. Such approach foreseen to add a microformat HTML node within the property of a container microformat.133 *134 * For further details see {@link SingleDocumentExtraction} consolidateResources(java.util.List, java.util.List,135 * org.apache.any23.writer.TripleHandler)}136 *137 * @throws IOException138 * if there is an error loading input data139 * @throws ExtractionException140 * if an exception is raised during extraction141 * @throws RepositoryException142 * if an error is encountered whilst loading content from a storage connection143 */144 @Test
145publicvoid testNestedMicroformats() throws IOException, ExtractionException, RepositoryException {
146 singleDocumentExtraction = getInstance("/microformats/nested-microformats-a1.html");
147 singleDocumentExtraction.run();
148149 logStorageContent();
150151 assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 2);
152 assertTriple(vSINDICE.getProperty(SINDICE.NESTING), (Value) null);
153 assertTriple(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vICAL.summary);
154 assertTriple(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), (Value) null);
155 }
156157/**158 * This test assess the absence of {@link SINDICE} <i>nesting</i> relationship, since159 * {@link org.apache.any23.extractor.html.HCardExtractor} declared a native nesting with the160 * {@link org.apache.any23.extractor.html.AdrExtractor}.161 *162 * @see org.apache.any23.extractor.html.annotations.Includes163 * 164 * @throws IOException165 * if there is an error loading input data166 * @throws ExtractionException167 * if an exception is raised during extraction168 * @throws RepositoryException169 * if an error is encountered whilst loading content from a storage connection170 */171 @Test
172publicvoid testNestedVCardAdr() throws IOException, ExtractionException, RepositoryException {
173 singleDocumentExtraction = getInstance("/microformats/nested-microformats-a3.html");
174 singleDocumentExtraction.run();
175176 logStorageContent();
177178 assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), (Value) null, 0);
179 assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), (Value) null, 0);
180 }
181182/**183 * Tests the nested microformat relationships. This test verifies the second supported approach for microformat184 * nesting. Such approach foreseen to use the same node attributes to declare both a microformat container property185 * and a nested microformat root class.186 *187 * For further details see {@link SingleDocumentExtraction} consolidateResources(java.util.List, java.util.List,188 * org.apache.any23.writer.TripleHandler)}189 *190 * See also the <a href="http://www.google.com/support/webmasters/bin/answer.py?answer=146862">Nested Entities</a>191 * article that is linked by the official microformats.org doc page.192 *193 * @throws IOException194 * if there is an error loading input data195 * @throws ExtractionException196 * if an exception is raised during extraction197 * @throws RepositoryException198 * if an error is encountered whilst loading content from a storage connection199 */200 @Test
201publicvoid testNestedMicroformatsInduced() throws IOException, ExtractionException, RepositoryException {
202 singleDocumentExtraction = getInstance("/microformats/nested-microformats-a2.html");
203 singleDocumentExtraction.run();
204205 logStorageContent();
206207 assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 2);
208 assertTriple(vSINDICE.getProperty(SINDICE.NESTING), (Value) null);
209 assertTriple(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vICAL.summary);
210 assertTriple(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), (Value) null);
211 }
212213/**214 * Tests the nested microformat relationships. This test verifies the behavior of the nested microformats when the215 * nesting relationship is handled by the microformat extractor itself (like the HReview that is able to detect an216 * inner VCard).217 *218 * @throws IOException219 * if there is an error loading input data220 * @throws ExtractionException221 * if an exception is raised during extraction222 * @throws RepositoryException223 * if an error is encountered whilst loading content from a storage connection224 */225 @Test
226/*227 * NOTE: The triple (bnode http://www.w3.org/2006/vcard/ns#url http://pizza.example.com) and (bnode228 * http://vocab.sindice.net/nesting_original (structured) *) are printed out twice, once for every extractor. The229 * RDFWriter doesn't remove the duplicates and some graph renderers show the triple property as double. Despite this230 * the model contains it just once.231 */232publicvoid testNestedMicroformatsManaged() throws IOException, ExtractionException, RepositoryException {
233 singleDocumentExtraction = getInstance("/microformats/nested-microformats-managed.html");
234 singleDocumentExtraction.run();
235236 logStorageContent();
237238 assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 3);
239 assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING), (Value) null, 1);
240 assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vREVIEW.hasReview, 1);
241242 assertTripleCount(vVCARD.url, (Value) null, 1);
243 Value object = getTripleObject(null, vREVIEW.hasReview);
244 assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), object, 1);
245 assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vREVIEW.hasReview, 1);
246 }
247248private SingleDocumentExtraction getInstance(String file) throws FileNotFoundException, IOException {
249 baos = new ByteArrayOutputStream();
250 rdfxmlWriter = new RDFXMLWriter(baos);
251 repositoryWriter = new RepositoryWriter(conn);
252253final CompositeTripleHandler cth = new CompositeTripleHandler();
254 cth.addChild(rdfxmlWriter);
255 cth.addChild(repositoryWriter);
256257final ModifiableConfiguration configuration = DefaultConfiguration.copy();
258 configuration.setProperty("any23.extraction.metadata.domain.per.entity", "on");
259 SingleDocumentExtraction instance = new SingleDocumentExtraction(configuration,
260newHTMLFixture(copyResourceToTempFile(file)).getOpener("http://nested.test.com"), extractorGroup, cth);
261 instance.setMIMETypeDetector(new TikaMIMETypeDetector(new WhiteSpacesPurifier()));
262return instance;
263 }
264265/**266 * Logs the storage content.267 * 268 * @throws RepositoryException269 * if an error is encountered whilst loading content from a storage connection270 */271privatevoid logStorageContent() throws RepositoryException {
272 RepositoryResult<Statement> result = conn.getStatements(null, null, null, false);
273while (result.hasNext()) {
274 Statement statement = result.next();
275 logger.debug(statement.toString());
276 }
277 }
278279/**280 * Asserts that the triple pattern is present within the storage exactly n times.281 * 282 * @param predicate283 * @param value284 * @param occurrences285 * 286 * @throws RepositoryException287 */288privatevoid assertTripleCount(IRI predicate, Value value, int occurrences) throws RepositoryException {
289 RepositoryResult<Statement> statements = conn.getStatements(null, predicate, value, false);
290int count = 0;
291while (statements.hasNext()) {
292 statements.next();
293 count++;
294 }
295 Assert.assertEquals(
296 String.format(Locale.ROOT, "Cannot find triple (* %s %s) %d times", predicate, value, occurrences),
297 occurrences, count);
298 }
299300/**301 * Asserts that the triple pattern is present within the storage exactly n times.302 *303 * @param predicate304 * @param value305 * @param occurrences306 * 307 * @throws RepositoryException308 */309privatevoid assertTripleCount(IRI predicate, String value, int occurrences) throws RepositoryException {
310 assertTripleCount(predicate, SimpleValueFactory.getInstance().createLiteral(value), occurrences);
311 }
312313/**314 * Asserts that a triple exists exactly once.315 *316 * @param predicate317 * @param value318 * 319 * @throws RepositoryException320 */321privatevoid assertTriple(IRI predicate, Value value) throws RepositoryException {
322 assertTripleCount(predicate, value, 1);
323 }
324325/**326 * Asserts that a triple exists exactly once.327 *328 * @param predicate329 * @param value330 * 331 * @throws RepositoryException332 */333 @SuppressWarnings("unused")
334privatevoid assertTriple(IRI predicate, String value) throws RepositoryException {
335 assertTriple(predicate, SimpleValueFactory.getInstance().createLiteral(value));
336 }
337338/**339 * Retrieves the triple object matching with the given pattern that is expected to be just one.340 * 341 * @param sub342 * the triple subject, <code>null</code> for any.343 * @param prop344 * the triple property, <code>null</code> for any.345 * 346 * @return the object of the unique triple matching the given pattern.347 * 348 * @throws RepositoryException349 * if an error occurred during the search.350 */351private Value getTripleObject(Resource sub, IRI prop) throws RepositoryException {
352 RepositoryResult<Statement> statements = conn.getStatements(sub, prop, null, false);
353 Assert.assertTrue(statements.hasNext());
354 Statement statement = statements.next();
355 Value value = statement.getObject();
356 Assert.assertFalse("Expected just one result.", statements.hasNext());
357 statements.close();
358return value;
359 }
360361 }