This project has retired. For details please refer to its Attic page.
SingleDocumentExtractionTest xref
View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor;
19  
20  import org.apache.any23.AbstractAny23TestBase;
21  import org.apache.any23.configuration.DefaultConfiguration;
22  import org.apache.any23.configuration.ModifiableConfiguration;
23  import org.apache.any23.extractor.html.HTMLFixture;
24  import org.apache.any23.mime.TikaMIMETypeDetector;
25  import org.apache.any23.mime.purifier.WhiteSpacesPurifier;
26  import org.apache.any23.vocab.ICAL;
27  import org.apache.any23.vocab.Review;
28  import org.apache.any23.vocab.SINDICE;
29  import org.apache.any23.vocab.VCard;
30  import org.apache.any23.writer.CompositeTripleHandler;
31  import org.apache.any23.writer.RDFXMLWriter;
32  import org.apache.any23.writer.RepositoryWriter;
33  import org.apache.any23.writer.TripleHandlerException;
34  import org.junit.After;
35  import org.junit.Assert;
36  import org.junit.Before;
37  import org.junit.Test;
38  import org.eclipse.rdf4j.model.Resource;
39  import org.eclipse.rdf4j.model.Statement;
40  import org.eclipse.rdf4j.model.IRI;
41  import org.eclipse.rdf4j.model.Value;
42  import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
43  import org.eclipse.rdf4j.repository.RepositoryConnection;
44  import org.eclipse.rdf4j.repository.RepositoryException;
45  import org.eclipse.rdf4j.repository.RepositoryResult;
46  import org.eclipse.rdf4j.repository.sail.SailRepository;
47  import org.eclipse.rdf4j.sail.Sail;
48  import org.eclipse.rdf4j.sail.SailException;
49  import org.eclipse.rdf4j.sail.memory.MemoryStore;
50  import org.slf4j.Logger;
51  import org.slf4j.LoggerFactory;
52  
53  import java.io.ByteArrayOutputStream;
54  import java.io.FileNotFoundException;
55  import java.io.IOException;
56  import java.nio.charset.StandardCharsets;
57  import java.util.Locale;
58  
59  /**
60   * Test case for {@link SingleDocumentExtraction}.
61   *
62   * @author Michele Mostarda (mostarda@fbk.eu)
63   * @author Davide Palmisano (palmisano@fbk.eu)
64   */
65  // TODO #20 - Solve the issue that the hreview item and the vcard item share the same BNode because they have the same XPath in the DOM.
66  public class SingleDocumentExtractionTest extends AbstractAny23TestBase {
67  
68      private static final SINDICE vSINDICE = SINDICE.getInstance();
69      private static final ICAL vICAL = ICAL.getInstance();
70      private static final Review vREVIEW = Review.getInstance();
71      private static final VCard vVCARD = VCard.getInstance();
72  
73      private static final Logger logger = LoggerFactory.getLogger(SingleDocumentExtractionTest.class);
74  
75      private SingleDocumentExtraction singleDocumentExtraction;
76  
77      private ExtractorGroup extractorGroup;
78  
79      private Sail store;
80  
81      private RepositoryConnection conn;
82  
83      RepositoryWriter repositoryWriter;
84  
85      ByteArrayOutputStream baos;
86  
87      RDFXMLWriter rdfxmlWriter;
88  
89      @Before
90      public void setUp() throws Exception {
91          super.setUp();
92          extractorGroup = ExtractorRegistryImpl.getInstance().getExtractorGroup();
93          store = new MemoryStore();
94          store.init();
95          conn = new SailRepository(store).getConnection();
96      }
97  
98      @After
99      public void tearDown() throws SailException, RepositoryException, TripleHandlerException {
100         rdfxmlWriter.close();
101         repositoryWriter.close();
102         logger.debug(baos.toString(StandardCharsets.UTF_8));
103 
104         singleDocumentExtraction = null;
105         extractorGroup = null;
106         conn.close();
107         conn = null;
108         store.shutDown();
109         store = null;
110     }
111 
112     /**
113      * Tests the existence of the domain triples.
114      *
115      * @throws IOException
116      *             if there is an error loading input data
117      * @throws ExtractionException
118      *             if an exception is raised during extraction
119      * @throws RepositoryException
120      *             if an error is encountered whilst loading content from a storage connection
121      */
122     @Test
123     public void testMicroformatDomains() throws IOException, ExtractionException, RepositoryException {
124         singleDocumentExtraction = getInstance("/microformats/microformat-domains.html");
125         singleDocumentExtraction.run();
126         logStorageContent();
127         assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 1);
128     }
129 
130     /**
131      * Tests the nested microformat relationships. This test verifies the first supported approach for microformat
132      * nesting. Such approach foreseen to add a microformat HTML node within the property of a container microformat.
133      *
134      * For further details see {@link SingleDocumentExtraction} consolidateResources(java.util.List, java.util.List,
135      * org.apache.any23.writer.TripleHandler)}
136      *
137      * @throws IOException
138      *             if there is an error loading input data
139      * @throws ExtractionException
140      *             if an exception is raised during extraction
141      * @throws RepositoryException
142      *             if an error is encountered whilst loading content from a storage connection
143      */
144     @Test
145     public void testNestedMicroformats() throws IOException, ExtractionException, RepositoryException {
146         singleDocumentExtraction = getInstance("/microformats/nested-microformats-a1.html");
147         singleDocumentExtraction.run();
148 
149         logStorageContent();
150 
151         assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 2);
152         assertTriple(vSINDICE.getProperty(SINDICE.NESTING), (Value) null);
153         assertTriple(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vICAL.summary);
154         assertTriple(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), (Value) null);
155     }
156 
157     /**
158      * This test assess the absence of {@link SINDICE} <i>nesting</i> relationship, since
159      * {@link org.apache.any23.extractor.html.HCardExtractor} declared a native nesting with the
160      * {@link org.apache.any23.extractor.html.AdrExtractor}.
161      *
162      * @see org.apache.any23.extractor.html.annotations.Includes
163      * 
164      * @throws IOException
165      *             if there is an error loading input data
166      * @throws ExtractionException
167      *             if an exception is raised during extraction
168      * @throws RepositoryException
169      *             if an error is encountered whilst loading content from a storage connection
170      */
171     @Test
172     public void testNestedVCardAdr() throws IOException, ExtractionException, RepositoryException {
173         singleDocumentExtraction = getInstance("/microformats/nested-microformats-a3.html");
174         singleDocumentExtraction.run();
175 
176         logStorageContent();
177 
178         assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), (Value) null, 0);
179         assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), (Value) null, 0);
180     }
181 
182     /**
183      * Tests the nested microformat relationships. This test verifies the second supported approach for microformat
184      * nesting. Such approach foreseen to use the same node attributes to declare both a microformat container property
185      * and a nested microformat root class.
186      *
187      * For further details see {@link SingleDocumentExtraction} consolidateResources(java.util.List, java.util.List,
188      * org.apache.any23.writer.TripleHandler)}
189      *
190      * See also the <a href="http://www.google.com/support/webmasters/bin/answer.py?answer=146862">Nested Entities</a>
191      * article that is linked by the official microformats.org doc page.
192      *
193      * @throws IOException
194      *             if there is an error loading input data
195      * @throws ExtractionException
196      *             if an exception is raised during extraction
197      * @throws RepositoryException
198      *             if an error is encountered whilst loading content from a storage connection
199      */
200     @Test
201     public void testNestedMicroformatsInduced() throws IOException, ExtractionException, RepositoryException {
202         singleDocumentExtraction = getInstance("/microformats/nested-microformats-a2.html");
203         singleDocumentExtraction.run();
204 
205         logStorageContent();
206 
207         assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 2);
208         assertTriple(vSINDICE.getProperty(SINDICE.NESTING), (Value) null);
209         assertTriple(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vICAL.summary);
210         assertTriple(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), (Value) null);
211     }
212 
213     /**
214      * Tests the nested microformat relationships. This test verifies the behavior of the nested microformats when the
215      * nesting relationship is handled by the microformat extractor itself (like the HReview that is able to detect an
216      * inner VCard).
217      *
218      * @throws IOException
219      *             if there is an error loading input data
220      * @throws ExtractionException
221      *             if an exception is raised during extraction
222      * @throws RepositoryException
223      *             if an error is encountered whilst loading content from a storage connection
224      */
225     @Test
226     /*
227      * NOTE: The triple (bnode http://www.w3.org/2006/vcard/ns#url http://pizza.example.com) and (bnode
228      * http://vocab.sindice.net/nesting_original (structured) *) are printed out twice, once for every extractor. The
229      * RDFWriter doesn't remove the duplicates and some graph renderers show the triple property as double. Despite this
230      * the model contains it just once.
231      */
232     public void testNestedMicroformatsManaged() throws IOException, ExtractionException, RepositoryException {
233         singleDocumentExtraction = getInstance("/microformats/nested-microformats-managed.html");
234         singleDocumentExtraction.run();
235 
236         logStorageContent();
237 
238         assertTripleCount(vSINDICE.getProperty(SINDICE.DOMAIN), "nested.test.com", 3);
239         assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING), (Value) null, 1);
240         assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vREVIEW.hasReview, 1);
241 
242         assertTripleCount(vVCARD.url, (Value) null, 1);
243         Value object = getTripleObject(null, vREVIEW.hasReview);
244         assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_STRUCTURED), object, 1);
245         assertTripleCount(vSINDICE.getProperty(SINDICE.NESTING_ORIGINAL), vREVIEW.hasReview, 1);
246     }
247 
248     private SingleDocumentExtraction getInstance(String file) throws FileNotFoundException, IOException {
249         baos = new ByteArrayOutputStream();
250         rdfxmlWriter = new RDFXMLWriter(baos);
251         repositoryWriter = new RepositoryWriter(conn);
252 
253         final CompositeTripleHandler cth = new CompositeTripleHandler();
254         cth.addChild(rdfxmlWriter);
255         cth.addChild(repositoryWriter);
256 
257         final ModifiableConfiguration configuration = DefaultConfiguration.copy();
258         configuration.setProperty("any23.extraction.metadata.domain.per.entity", "on");
259         SingleDocumentExtraction instance = new SingleDocumentExtraction(configuration,
260                 new HTMLFixture(copyResourceToTempFile(file)).getOpener("http://nested.test.com"), extractorGroup, cth);
261         instance.setMIMETypeDetector(new TikaMIMETypeDetector(new WhiteSpacesPurifier()));
262         return instance;
263     }
264 
265     /**
266      * Logs the storage content.
267      * 
268      * @throws RepositoryException
269      *             if an error is encountered whilst loading content from a storage connection
270      */
271     private void logStorageContent() throws RepositoryException {
272         RepositoryResult<Statement> result = conn.getStatements(null, null, null, false);
273         while (result.hasNext()) {
274             Statement statement = result.next();
275             logger.debug(statement.toString());
276         }
277     }
278 
279     /**
280      * Asserts that the triple pattern is present within the storage exactly n times.
281      * 
282      * @param predicate
283      * @param value
284      * @param occurrences
285      * 
286      * @throws RepositoryException
287      */
288     private void assertTripleCount(IRI predicate, Value value, int occurrences) throws RepositoryException {
289         RepositoryResult<Statement> statements = conn.getStatements(null, predicate, value, false);
290         int count = 0;
291         while (statements.hasNext()) {
292             statements.next();
293             count++;
294         }
295         Assert.assertEquals(
296                 String.format(Locale.ROOT, "Cannot find triple (* %s %s) %d times", predicate, value, occurrences),
297                 occurrences, count);
298     }
299 
300     /**
301      * Asserts that the triple pattern is present within the storage exactly n times.
302      *
303      * @param predicate
304      * @param value
305      * @param occurrences
306      * 
307      * @throws RepositoryException
308      */
309     private void assertTripleCount(IRI predicate, String value, int occurrences) throws RepositoryException {
310         assertTripleCount(predicate, SimpleValueFactory.getInstance().createLiteral(value), occurrences);
311     }
312 
313     /**
314      * Asserts that a triple exists exactly once.
315      *
316      * @param predicate
317      * @param value
318      * 
319      * @throws RepositoryException
320      */
321     private void assertTriple(IRI predicate, Value value) throws RepositoryException {
322         assertTripleCount(predicate, value, 1);
323     }
324 
325     /**
326      * Asserts that a triple exists exactly once.
327      *
328      * @param predicate
329      * @param value
330      * 
331      * @throws RepositoryException
332      */
333     @SuppressWarnings("unused")
334     private void assertTriple(IRI predicate, String value) throws RepositoryException {
335         assertTriple(predicate, SimpleValueFactory.getInstance().createLiteral(value));
336     }
337 
338     /**
339      * Retrieves the triple object matching with the given pattern that is expected to be just one.
340      * 
341      * @param sub
342      *            the triple subject, <code>null</code> for any.
343      * @param prop
344      *            the triple property, <code>null</code> for any.
345      * 
346      * @return the object of the unique triple matching the given pattern.
347      * 
348      * @throws RepositoryException
349      *             if an error occurred during the search.
350      */
351     private Value getTripleObject(Resource sub, IRI prop) throws RepositoryException {
352         RepositoryResult<Statement> statements = conn.getStatements(sub, prop, null, false);
353         Assert.assertTrue(statements.hasNext());
354         Statement statement = statements.next();
355         Value value = statement.getObject();
356         Assert.assertFalse("Expected just one result.", statements.hasNext());
357         statements.close();
358         return value;
359     }
360 
361 }