This project has retired. For details please refer to its Attic page.
RDFa11ExtractorTest xref
View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.rdfa;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.apache.any23.extractor.ExtractorFactory;
22  import org.apache.any23.rdf.RDFUtils;
23  import org.apache.any23.vocab.FOAF;
24  import org.apache.any23.vocab.OGP;
25  import org.apache.any23.vocab.OGPMusic;
26  import org.junit.Assert;
27  import org.junit.Test;
28  import org.eclipse.rdf4j.model.Literal;
29  import org.eclipse.rdf4j.model.Statement;
30  import org.eclipse.rdf4j.model.Value;
31  import org.eclipse.rdf4j.model.vocabulary.RDF;
32  import org.eclipse.rdf4j.repository.RepositoryException;
33  import org.eclipse.rdf4j.repository.RepositoryResult;
34  import org.eclipse.rdf4j.rio.RDFHandlerException;
35  import org.eclipse.rdf4j.rio.RDFParseException;
36  
37  import java.io.IOException;
38  
39  /**
40   * Reference test class for {@link RDFa11Extractor} class.
41   *
42   * @author Michele Mostarda (mostarda@fbk.eu)
43   */
44  
45  public class RDFa11ExtractorTest extends AbstractRDFaExtractorTestCase {
46  
47      /**
48       * This test verifies the correct object resource conversion.
49       *
50       * @throws RepositoryException
51       *             if an error is encountered whilst loading content from a storage connection
52       */
53      @Test
54      public void testObjectResourceConversion() throws RepositoryException {
55          assertExtract("/html/rdfa/object-resource-test.html");
56          logger.debug(dumpModelToTurtle());
57          assertContains(null, FOAF.getInstance().page, RDFUtils.iri("http://en.wikipedia.org/New_York"));
58      }
59  
60      @Test
61      public void testBBCNewsScotland() {
62          assertExtract("/html/BBC_News_Scotland.html");
63          assertModelNotEmpty();
64          assertStatementsSize(null, RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"),
65                  RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#navigation"), 1);
66          assertStatementsSize(null, RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"),
67                  RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#search"), 1);
68          assertStatementsSize(null, RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"),
69                  RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#contentinfo"), 1);
70          assertStatementsSize(null, RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"),
71                  RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#presentation"), 8);
72      }
73  
74      @Test
75      public void testInvalidXMLCharacter() {
76          assertExtract("/html/rdfa/invalid-xml-character.html");
77          assertModelNotEmpty();
78      }
79  
80      @Test
81      public void testAttributeAlreadySpecified() {
82          assertExtract("/html/rdfa/attribute-already-specified.html");
83          assertModelNotEmpty();
84      }
85  
86      @Test
87      public void test0087() {
88          assertExtract("/html/rdfa/0087.xhtml");
89          assertModelNotEmpty();
90          assertStatementsSize(null, null, null, 24);
91          assertContains(RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#stylesheet"),
92                  RDFUtils.iri("http://example.org/stylesheet"));
93      }
94  
95      @Test
96      public void testBasicWithSyntaxErrors() {
97          // test issues ANY23-347 and ANY23-350
98          assertExtract("/html/rdfa/basic-with-errors.html");
99          assertContains(null, vDCTERMS.creator, RDFUtils.literal("Alice", "en"));
100         assertContains(null, vDCTERMS.title, RDFUtils.literal("The trouble with Bob", "en"));
101         assertContains(null, RDFUtils.iri("http://fake.org/prop"), RDFUtils.literal("Mary", "en"));
102     }
103 
104     @Test
105     public void testIssue326() {
106         assertExtract("/html/rdfa/rdfa-issue326-and-267.html");
107     }
108 
109     @Test
110     public void testIssue227() {
111         assertExtract("/html/rdfa/rdfa-issue227.html");
112         logger.debug(dumpModelToTurtle());
113         assertContains(baseIRI, RDFUtils.iri("http://ogp.me/ns#title"),
114                 "Bread — Free listening, videos, concerts, stats and photos at Last.fm", "en");
115     }
116 
117     @Test
118     public void testIssue271AndJavascriptParsing() {
119         assertExtract("/html/rdfa/rdfa-issue271-and-317.html");
120         logger.debug(dumpModelToTurtle());
121         assertModelNotEmpty();
122     }
123 
124     @Test
125     public void testIssue273() {
126         assertExtract("/html/rdfa/rdfa-issue273-and-317.html");
127         assertModelNotEmpty();
128     }
129 
130     @Test
131     public void testIssue268And317() {
132         assertExtract("/html/rdfa/rdfa-issue268-and-317.html");
133     }
134 
135     /**
136      * This test checks the behavior of the <i>RDFa</i> extraction where the datatype of a property is explicitly set.
137      * For details see the <a href="http://www.w3.org/TR/rdfa-syntax/">RDFa in XHTML: Syntax and Processing</a>
138      * recommendation.
139      *
140      * @throws RepositoryException
141      *             if an error is encountered whilst loading content from a storage connection
142      */
143     @Test
144     public void testExplicitDatatypeDeclaration() throws RepositoryException {
145         assertExtract("/html/rdfa/xmlliteral-datatype-test.html");
146         logger.debug(dumpModelToTurtle());
147 
148         RepositoryResult<Statement> stmts = conn
149                 .getStatements(RDFUtils.iri("http://dbpedia.org/resource/Albert_Einstein"), vFOAF.name, null, false);
150         Assert.assertTrue(stmts.hasNext());
151         Value obj = stmts.next().getObject();
152         Assert.assertTrue(obj instanceof Literal);
153         Literal lit = (Literal) obj;
154         Assert.assertEquals(lit.getDatatype(), RDF.XMLLITERAL);
155         Assert.assertEquals(lit.getLabel(),
156                 "Albert <strong xmlns=\"http://www.w3.org/1999/xhtml\" " + "xmlns:foaf=\"http://xmlns.com/foaf/0.1/\" "
157                         + "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\" "
158                         + "xmlns:rdfs=\"http://www.w3.org/2000/01/rdf-schema#\" "
159                         + "xmlns:xsd=\"http://www.w3.org/2001/XMLSchema#\">Einstein</strong>");
160     }
161 
162     /**
163      * Tests the correct behavior of <i>REL</i> and <i>HREF</i>.
164      *
165      * @throws RepositoryException
166      *             if an error is encountered whilst loading content from a storage connection
167      */
168     @Test
169     public void testRelWithHref() throws RepositoryException {
170         assertExtract("/html/rdfa/rel-href.html");
171         logger.debug(dumpModelToTurtle());
172 
173         assertContains(RDFUtils.iri(baseIRI.toString(), "#me"), FOAF.getInstance().name, "John Doe");
174         assertContains(RDFUtils.iri(baseIRI.toString(), "#me"), FOAF.getInstance().homepage,
175                 RDFUtils.iri("http://example.org/blog/"));
176     }
177 
178     /**
179      * This test verifies the correct <em>REL/REV</em> attribute usage.
180      *
181      * @throws RepositoryException
182      *             if an error is encountered whilst loading content from a storage connection
183      */
184     @Test
185     public void testRelRevSupport() throws RepositoryException {
186         assertExtract("/html/rdfa/rel-rev.html");
187         logger.debug(dumpModelToTurtle());
188 
189         assertContains(baseIRI, RDFUtils.iri("http://bob.example.com/cite"),
190                 RDFUtils.iri("http://www.example.com/books/the_two_towers"));
191         assertContains(RDFUtils.iri("http://path/to/chapter"), RDFUtils.iri("http://bob.example.com/isChapterOf"),
192                 baseIRI);
193     }
194 
195     /**
196      * Tests the <em>@vocab</em> support.
197      *
198      * @throws RepositoryException
199      *             if an error is encountered whilst loading content from a storage connection
200      */
201     @Test
202     public void testVocabSupport() throws RepositoryException {
203         assertExtract("/html/rdfa/vocab.html");
204         logger.debug(dumpModelToTurtle());
205 
206         assertContains(RDFUtils.iri(baseIRI.toString(), "#me"), RDFUtils.iri("http://xmlns.com/foaf/0.1/name"),
207                 RDFUtils.literal("John Doe"));
208         assertContains(RDFUtils.iri(baseIRI.toString(), "#me"), RDFUtils.iri("http://xmlns.com/foaf/0.1/homepage"),
209                 RDFUtils.iri("http://example.org/blog/"));
210     }
211 
212     @Test
213     public void testVocabWithoutTrailingSlash() {
214         // test for issue ANY23-428
215         assertExtract("/html/rdfa/vocab-without-trailing-slash.html");
216 
217         assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/BreadcrumbList"));
218     }
219 
220     /**
221      * Tests that the default parser settings enable tolerance in data type parsing.
222      */
223     @Test
224     public void testTolerantParsing() {
225         assertExtract("/html/rdfa/oreilly-invalid-datatype.html", false);
226     }
227 
228     /**
229      * Taken from the <a href="http://www.heppnetz.de/rdfa4google/testcases.html">GoodRelations test cases</a>. It
230      * checks if the extraction is the same when the namespaces are defined in <i>RDFa1.0</i>.
231      *
232      * @throws RepositoryException
233      *             if an error is encountered whilst loading content from a storage connection
234      * @throws java.io.IOException
235      *             if there is an error processing input data
236      * @throws org.eclipse.rdf4j.rio.RDFHandlerException
237      *             if there is an error in the {@link org.eclipse.rdf4j.rio.RDFHandler}
238      * @throws org.eclipse.rdf4j.rio.RDFParseException
239      *             if there is an error parsing input RDF
240      */
241     @Test
242     public void testRDFa10Extraction() throws RepositoryException, RDFHandlerException, IOException, RDFParseException {
243         final int EXPECTED_STATEMENTS = 31;
244 
245         assertExtract("/html/rdfa/goodrelations-rdfa10.html");
246         logger.debug(dumpModelToNQuads());
247 
248         Assert.assertEquals(EXPECTED_STATEMENTS, dumpAsListOfStatements().size());
249         assertContainsModel("/html/rdfa/goodrelations-rdfa10-expected.nq");
250     }
251 
252     /**
253      * Taken from the <a href="http://www.heppnetz.de/rdfa4google/testcases.html">GoodRelations test cases</a>. It
254      * checks if the extraction is the same when the namespaces are defined in <i>RDFa1.1</i>.
255      *
256      * @throws RepositoryException
257      *             if an error is encountered whilst loading content from a storage connection
258      * @throws java.io.IOException
259      *             if there is an error processing input data
260      * @throws org.eclipse.rdf4j.rio.RDFHandlerException
261      *             if there is an error in the {@link org.eclipse.rdf4j.rio.RDFHandler}
262      * @throws org.eclipse.rdf4j.rio.RDFParseException
263      *             if there is an error parsing input RDF
264      */
265     @Test
266     public void testRDFa11Extraction() throws RepositoryException, RDFHandlerException, IOException, RDFParseException {
267         final int EXPECTED_STATEMENTS = 31;
268 
269         assertExtract("/html/rdfa/goodrelations-rdfa11.html");
270         logger.debug(dumpHumanReadableTriples());
271 
272         Assert.assertEquals(EXPECTED_STATEMENTS, dumpAsListOfStatements().size());
273         assertContainsModel("/html/rdfa/goodrelations-rdfa10-expected.nq");
274     }
275 
276     /**
277      * Tests the correct support of <a href="http://ogp.me/">Open Graph Protocol's</a>
278      * <a href="http://ogp.me/#metadata">Basic Metadata</a>, <a href="http://ogp.me/#optional">Optional Metadata</a>,
279      * <a href="http://ogp.me/#structured">Structured Properties</a> and <a href="http://ogp.me/#array">Arrays</a>.
280      *
281      * @throws IOException
282      *             if there is an error processing the input data
283      * @throws org.apache.any23.extractor.ExtractionException
284      *             if there is an exception during extraction
285      * @throws RepositoryException
286      *             if an error is encountered whilst loading content from a storage connection
287      */
288     @Test
289     public void testOpenGraphStructuredProperties() throws IOException, ExtractionException, RepositoryException {
290         assertExtract("/html/rdfa/opengraph-structured-properties.html");
291         logger.debug(dumpHumanReadableTriples());
292 
293         Assert.assertEquals(31, getStatementsSize(null, null, null));
294         final OGP vOGP = OGP.getInstance();
295         assertContains(baseIRI, vOGP.audio, RDFUtils.literal("http://example.com/sound.mp3"));
296         assertContains(baseIRI, vOGP.description, RDFUtils
297                 .literal("Sean Connery found fame and fortune as the suave, sophisticated British agent, James Bond."));
298         assertContains(baseIRI, vOGP.determiner, RDFUtils.literal("the"));
299         assertContains(baseIRI, vOGP.locale, RDFUtils.literal("en_GB"));
300         assertContains(baseIRI, vOGP.localeAlternate, RDFUtils.literal("fr_FR"));
301         assertContains(baseIRI, vOGP.localeAlternate, RDFUtils.literal("es_ES"));
302         assertContains(baseIRI, vOGP.siteName, RDFUtils.literal("IMDb"));
303         assertContains(baseIRI, vOGP.video, RDFUtils.literal("http://example.com/bond/trailer.swf"));
304     }
305 
306     @Override
307     protected ExtractorFactory<?> getExtractorFactory() {
308         return new RDFa11ExtractorFactory();
309     }
310 
311     /**
312      * Tests the correct support of alternate <a href="http://ogp.me/#types">Open Graph Protocol Object Types</a>
313      *
314      * @throws IOException
315      *             if there is an error processing the input data
316      * @throws org.apache.any23.extractor.ExtractionException
317      *             if there is an exception during extraction
318      * @throws RepositoryException
319      *             if an error is encountered whilst loading content from a storage connection
320      */
321     @Test
322     public void testOpenGraphAlternateObjectTypes() throws IOException, ExtractionException, RepositoryException {
323         assertExtract("/html/rdfa/opengraph-music-song-object-type.html");
324         logger.debug(dumpHumanReadableTriples());
325 
326         Assert.assertEquals(9, getStatementsSize(null, null, null));
327         final OGPMusic vOGPMusic = OGPMusic.getInstance();
328         assertContains(baseIRI, vOGPMusic.musicDuration, RDFUtils.literal("447"));
329         assertContains(baseIRI, vOGPMusic.musicMusician,
330                 RDFUtils.literal("Jono Grant / Tony McGuinness / Ashley Tomberlin"));
331         assertContains(baseIRI, vOGPMusic.musicAlbum, RDFUtils.literal("Tri-State"));
332     }
333 
334 }