1/*2 * Licensed to the Apache Software Foundation (ASF) under one or more3 * contributor license agreements. See the NOTICE file distributed with4 * this work for additional information regarding copyright ownership.5 * The ASF licenses this file to You under the Apache License, Version 2.06 * (the "License"); you may not use this file except in compliance with7 * the License. You may obtain a copy of the License at8 *9 * http://www.apache.org/licenses/LICENSE-2.010 *11 * Unless required by applicable law or agreed to in writing, software12 * distributed under the License is distributed on an "AS IS" BASIS,13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.14 * See the License for the specific language governing permissions and15 * limitations under the License.16 */1718package org.apache.any23.extractor.rdfa;
1920import org.apache.any23.extractor.ExtractionException;
21import org.apache.any23.extractor.ExtractorFactory;
22import org.apache.any23.rdf.RDFUtils;
23import org.apache.any23.vocab.FOAF;
24import org.apache.any23.vocab.OGP;
25import org.apache.any23.vocab.OGPMusic;
26import org.junit.Assert;
27import org.junit.Test;
28import org.eclipse.rdf4j.model.Literal;
29import org.eclipse.rdf4j.model.Statement;
30import org.eclipse.rdf4j.model.Value;
31import org.eclipse.rdf4j.model.vocabulary.RDF;
32import org.eclipse.rdf4j.repository.RepositoryException;
33import org.eclipse.rdf4j.repository.RepositoryResult;
34import org.eclipse.rdf4j.rio.RDFHandlerException;
35import org.eclipse.rdf4j.rio.RDFParseException;
3637import java.io.IOException;
3839/**40 * Reference test class for {@link RDFa11Extractor} class.41 *42 * @author Michele Mostarda (mostarda@fbk.eu)43 */4445publicclassRDFa11ExtractorTestextendsAbstractRDFaExtractorTestCase {
4647/**48 * This test verifies the correct object resource conversion.49 *50 * @throws RepositoryException51 * if an error is encountered whilst loading content from a storage connection52 */53 @Test
54publicvoid testObjectResourceConversion() throws RepositoryException {
55 assertExtract("/html/rdfa/object-resource-test.html");
56 logger.debug(dumpModelToTurtle());
57 assertContains(null, FOAF.getInstance().page, RDFUtils.iri("http://en.wikipedia.org/New_York"));
58 }
5960 @Test
61publicvoid testBBCNewsScotland() {
62 assertExtract("/html/BBC_News_Scotland.html");
63 assertModelNotEmpty();
64 assertStatementsSize(null, RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"),
65 RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#navigation"), 1);
66 assertStatementsSize(null, RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"),
67 RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#search"), 1);
68 assertStatementsSize(null, RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"),
69 RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#contentinfo"), 1);
70 assertStatementsSize(null, RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#role"),
71 RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#presentation"), 8);
72 }
7374 @Test
75publicvoid testInvalidXMLCharacter() {
76 assertExtract("/html/rdfa/invalid-xml-character.html");
77 assertModelNotEmpty();
78 }
7980 @Test
81publicvoid testAttributeAlreadySpecified() {
82 assertExtract("/html/rdfa/attribute-already-specified.html");
83 assertModelNotEmpty();
84 }
8586 @Test
87publicvoid test0087() {
88 assertExtract("/html/rdfa/0087.xhtml");
89 assertModelNotEmpty();
90 assertStatementsSize(null, null, null, 24);
91 assertContains(RDFUtils.iri("http://www.w3.org/1999/xhtml/vocab#stylesheet"),
92 RDFUtils.iri("http://example.org/stylesheet"));
93 }
9495 @Test
96publicvoid testBasicWithSyntaxErrors() {
97// test issues ANY23-347 and ANY23-35098 assertExtract("/html/rdfa/basic-with-errors.html");
99 assertContains(null, vDCTERMS.creator, RDFUtils.literal("Alice", "en"));
100 assertContains(null, vDCTERMS.title, RDFUtils.literal("The trouble with Bob", "en"));
101 assertContains(null, RDFUtils.iri("http://fake.org/prop"), RDFUtils.literal("Mary", "en"));
102 }
103104 @Test
105publicvoid testIssue326() {
106 assertExtract("/html/rdfa/rdfa-issue326-and-267.html");
107 }
108109 @Test
110publicvoid testIssue227() {
111 assertExtract("/html/rdfa/rdfa-issue227.html");
112 logger.debug(dumpModelToTurtle());
113 assertContains(baseIRI, RDFUtils.iri("http://ogp.me/ns#title"),
114"Bread — Free listening, videos, concerts, stats and photos at Last.fm", "en");
115 }
116117 @Test
118publicvoid testIssue271AndJavascriptParsing() {
119 assertExtract("/html/rdfa/rdfa-issue271-and-317.html");
120 logger.debug(dumpModelToTurtle());
121 assertModelNotEmpty();
122 }
123124 @Test
125publicvoid testIssue273() {
126 assertExtract("/html/rdfa/rdfa-issue273-and-317.html");
127 assertModelNotEmpty();
128 }
129130 @Test
131publicvoid testIssue268And317() {
132 assertExtract("/html/rdfa/rdfa-issue268-and-317.html");
133 }
134135/**136 * This test checks the behavior of the <i>RDFa</i> extraction where the datatype of a property is explicitly set.137 * For details see the <a href="http://www.w3.org/TR/rdfa-syntax/">RDFa in XHTML: Syntax and Processing</a>138 * recommendation.139 *140 * @throws RepositoryException141 * if an error is encountered whilst loading content from a storage connection142 */143 @Test
144publicvoid testExplicitDatatypeDeclaration() throws RepositoryException {
145 assertExtract("/html/rdfa/xmlliteral-datatype-test.html");
146 logger.debug(dumpModelToTurtle());
147148 RepositoryResult<Statement> stmts = conn
149 .getStatements(RDFUtils.iri("http://dbpedia.org/resource/Albert_Einstein"), vFOAF.name, null, false);
150 Assert.assertTrue(stmts.hasNext());
151 Value obj = stmts.next().getObject();
152 Assert.assertTrue(obj instanceof Literal);
153 Literal lit = (Literal) obj;
154 Assert.assertEquals(lit.getDatatype(), RDF.XMLLITERAL);
155 Assert.assertEquals(lit.getLabel(),
156"Albert <strong xmlns=\"http://www.w3.org/1999/xhtml\" " + "xmlns:foaf=\"http://xmlns.com/foaf/0.1/\" "157 + "xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\" "158 + "xmlns:rdfs=\"http://www.w3.org/2000/01/rdf-schema#\" "159 + "xmlns:xsd=\"http://www.w3.org/2001/XMLSchema#\">Einstein</strong>");
160 }
161162/**163 * Tests the correct behavior of <i>REL</i> and <i>HREF</i>.164 *165 * @throws RepositoryException166 * if an error is encountered whilst loading content from a storage connection167 */168 @Test
169publicvoid testRelWithHref() throws RepositoryException {
170 assertExtract("/html/rdfa/rel-href.html");
171 logger.debug(dumpModelToTurtle());
172173 assertContains(RDFUtils.iri(baseIRI.toString(), "#me"), FOAF.getInstance().name, "John Doe");
174 assertContains(RDFUtils.iri(baseIRI.toString(), "#me"), FOAF.getInstance().homepage,
175 RDFUtils.iri("http://example.org/blog/"));
176 }
177178/**179 * This test verifies the correct <em>REL/REV</em> attribute usage.180 *181 * @throws RepositoryException182 * if an error is encountered whilst loading content from a storage connection183 */184 @Test
185publicvoid testRelRevSupport() throws RepositoryException {
186 assertExtract("/html/rdfa/rel-rev.html");
187 logger.debug(dumpModelToTurtle());
188189 assertContains(baseIRI, RDFUtils.iri("http://bob.example.com/cite"),
190 RDFUtils.iri("http://www.example.com/books/the_two_towers"));
191 assertContains(RDFUtils.iri("http://path/to/chapter"), RDFUtils.iri("http://bob.example.com/isChapterOf"),
192 baseIRI);
193 }
194195/**196 * Tests the <em>@vocab</em> support.197 *198 * @throws RepositoryException199 * if an error is encountered whilst loading content from a storage connection200 */201 @Test
202publicvoid testVocabSupport() throws RepositoryException {
203 assertExtract("/html/rdfa/vocab.html");
204 logger.debug(dumpModelToTurtle());
205206 assertContains(RDFUtils.iri(baseIRI.toString(), "#me"), RDFUtils.iri("http://xmlns.com/foaf/0.1/name"),
207 RDFUtils.literal("John Doe"));
208 assertContains(RDFUtils.iri(baseIRI.toString(), "#me"), RDFUtils.iri("http://xmlns.com/foaf/0.1/homepage"),
209 RDFUtils.iri("http://example.org/blog/"));
210 }
211212 @Test
213publicvoid testVocabWithoutTrailingSlash() {
214// test for issue ANY23-428215 assertExtract("/html/rdfa/vocab-without-trailing-slash.html");
216217 assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/BreadcrumbList"));
218 }
219220/**221 * Tests that the default parser settings enable tolerance in data type parsing.222 */223 @Test
224publicvoid testTolerantParsing() {
225 assertExtract("/html/rdfa/oreilly-invalid-datatype.html", false);
226 }
227228/**229 * Taken from the <a href="http://www.heppnetz.de/rdfa4google/testcases.html">GoodRelations test cases</a>. It230 * checks if the extraction is the same when the namespaces are defined in <i>RDFa1.0</i>.231 *232 * @throws RepositoryException233 * if an error is encountered whilst loading content from a storage connection234 * @throws java.io.IOException235 * if there is an error processing input data236 * @throws org.eclipse.rdf4j.rio.RDFHandlerException237 * if there is an error in the {@link org.eclipse.rdf4j.rio.RDFHandler}238 * @throws org.eclipse.rdf4j.rio.RDFParseException239 * if there is an error parsing input RDF240 */241 @Test
242publicvoid testRDFa10Extraction() throws RepositoryException, RDFHandlerException, IOException, RDFParseException {
243finalint EXPECTED_STATEMENTS = 31;
244245 assertExtract("/html/rdfa/goodrelations-rdfa10.html");
246 logger.debug(dumpModelToNQuads());
247248 Assert.assertEquals(EXPECTED_STATEMENTS, dumpAsListOfStatements().size());
249 assertContainsModel("/html/rdfa/goodrelations-rdfa10-expected.nq");
250 }
251252/**253 * Taken from the <a href="http://www.heppnetz.de/rdfa4google/testcases.html">GoodRelations test cases</a>. It254 * checks if the extraction is the same when the namespaces are defined in <i>RDFa1.1</i>.255 *256 * @throws RepositoryException257 * if an error is encountered whilst loading content from a storage connection258 * @throws java.io.IOException259 * if there is an error processing input data260 * @throws org.eclipse.rdf4j.rio.RDFHandlerException261 * if there is an error in the {@link org.eclipse.rdf4j.rio.RDFHandler}262 * @throws org.eclipse.rdf4j.rio.RDFParseException263 * if there is an error parsing input RDF264 */265 @Test
266publicvoid testRDFa11Extraction() throws RepositoryException, RDFHandlerException, IOException, RDFParseException {
267finalint EXPECTED_STATEMENTS = 31;
268269 assertExtract("/html/rdfa/goodrelations-rdfa11.html");
270 logger.debug(dumpHumanReadableTriples());
271272 Assert.assertEquals(EXPECTED_STATEMENTS, dumpAsListOfStatements().size());
273 assertContainsModel("/html/rdfa/goodrelations-rdfa10-expected.nq");
274 }
275276/**277 * Tests the correct support of <a href="http://ogp.me/">Open Graph Protocol's</a>278 * <a href="http://ogp.me/#metadata">Basic Metadata</a>, <a href="http://ogp.me/#optional">Optional Metadata</a>,279 * <a href="http://ogp.me/#structured">Structured Properties</a> and <a href="http://ogp.me/#array">Arrays</a>.280 *281 * @throws IOException282 * if there is an error processing the input data283 * @throws org.apache.any23.extractor.ExtractionException284 * if there is an exception during extraction285 * @throws RepositoryException286 * if an error is encountered whilst loading content from a storage connection287 */288 @Test
289publicvoid testOpenGraphStructuredProperties() throws IOException, ExtractionException, RepositoryException {
290 assertExtract("/html/rdfa/opengraph-structured-properties.html");
291 logger.debug(dumpHumanReadableTriples());
292293 Assert.assertEquals(31, getStatementsSize(null, null, null));
294final OGP vOGP = OGP.getInstance();
295 assertContains(baseIRI, vOGP.audio, RDFUtils.literal("http://example.com/sound.mp3"));
296 assertContains(baseIRI, vOGP.description, RDFUtils
297 .literal("Sean Connery found fame and fortune as the suave, sophisticated British agent, James Bond."));
298 assertContains(baseIRI, vOGP.determiner, RDFUtils.literal("the"));
299 assertContains(baseIRI, vOGP.locale, RDFUtils.literal("en_GB"));
300 assertContains(baseIRI, vOGP.localeAlternate, RDFUtils.literal("fr_FR"));
301 assertContains(baseIRI, vOGP.localeAlternate, RDFUtils.literal("es_ES"));
302 assertContains(baseIRI, vOGP.siteName, RDFUtils.literal("IMDb"));
303 assertContains(baseIRI, vOGP.video, RDFUtils.literal("http://example.com/bond/trailer.swf"));
304 }
305306 @Override
307protected ExtractorFactory<?> getExtractorFactory() {
308returnnew RDFa11ExtractorFactory();
309 }
310311/**312 * Tests the correct support of alternate <a href="http://ogp.me/#types">Open Graph Protocol Object Types</a>313 *314 * @throws IOException315 * if there is an error processing the input data316 * @throws org.apache.any23.extractor.ExtractionException317 * if there is an exception during extraction318 * @throws RepositoryException319 * if an error is encountered whilst loading content from a storage connection320 */321 @Test
322publicvoid testOpenGraphAlternateObjectTypes() throws IOException, ExtractionException, RepositoryException {
323 assertExtract("/html/rdfa/opengraph-music-song-object-type.html");
324 logger.debug(dumpHumanReadableTriples());
325326 Assert.assertEquals(9, getStatementsSize(null, null, null));
327final OGPMusic vOGPMusic = OGPMusic.getInstance();
328 assertContains(baseIRI, vOGPMusic.musicDuration, RDFUtils.literal("447"));
329 assertContains(baseIRI, vOGPMusic.musicMusician,
330 RDFUtils.literal("Jono Grant / Tony McGuinness / Ashley Tomberlin"));
331 assertContains(baseIRI, vOGPMusic.musicAlbum, RDFUtils.literal("Tri-State"));
332 }
333334 }