This project has retired. For details please refer to its Attic page.
MicrodataExtractorTest xref
View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.microdata;
19  
20  import org.apache.any23.Any23;
21  import org.apache.any23.Any23OnlineTestBase;
22  import org.apache.any23.configuration.DefaultConfiguration;
23  import org.apache.any23.configuration.ModifiableConfiguration;
24  import org.apache.any23.extractor.ExtractionException;
25  import org.apache.any23.extractor.ExtractorFactory;
26  import org.apache.any23.extractor.IssueReport;
27  import org.apache.any23.extractor.html.AbstractExtractorTestCase;
28  import org.apache.any23.extractor.rdf.TurtleExtractorFactory;
29  import org.apache.any23.rdf.RDFUtils;
30  import org.apache.any23.source.DocumentSource;
31  import org.apache.any23.source.HTTPDocumentSource;
32  import org.apache.any23.writer.TripleWriterHandler;
33  import org.eclipse.rdf4j.model.IRI;
34  import org.eclipse.rdf4j.model.Model;
35  import org.eclipse.rdf4j.model.Value;
36  import org.eclipse.rdf4j.model.Literal;
37  import org.eclipse.rdf4j.model.Resource;
38  import org.eclipse.rdf4j.model.impl.TreeModel;
39  import org.eclipse.rdf4j.model.util.Models;
40  import org.eclipse.rdf4j.model.vocabulary.RDF;
41  import org.eclipse.rdf4j.model.vocabulary.RDFS;
42  import org.slf4j.Logger;
43  import org.slf4j.LoggerFactory;
44  import org.junit.Assert;
45  import org.junit.Test;
46  import org.eclipse.rdf4j.model.BNode;
47  import org.eclipse.rdf4j.model.Statement;
48  import org.eclipse.rdf4j.repository.RepositoryException;
49  import org.eclipse.rdf4j.rio.RDFFormat;
50  import org.eclipse.rdf4j.rio.RDFHandler;
51  import org.eclipse.rdf4j.rio.RDFHandlerException;
52  import org.eclipse.rdf4j.rio.RDFParseException;
53  import org.eclipse.rdf4j.rio.RDFParser;
54  import org.eclipse.rdf4j.rio.Rio;
55  
56  import java.io.File;
57  import java.io.FileReader;
58  import java.io.IOException;
59  import java.nio.charset.StandardCharsets;
60  import java.util.ArrayDeque;
61  import java.util.ArrayList;
62  import java.util.Arrays;
63  import java.util.Collections;
64  import java.util.HashMap;
65  import java.util.List;
66  import java.util.Map;
67  import java.util.TreeMap;
68  import java.util.concurrent.atomic.AtomicInteger;
69  
70  /**
71   * Reference test class for {@link MicrodataExtractor}.
72   *
73   * @author Davide Palmisano ( dpalmisano@gmail.com )
74   */
75  public class MicrodataExtractorTest extends AbstractExtractorTestCase {
76  
77      private static final Logger logger = LoggerFactory.getLogger(MicrodataExtractorTest.class);
78  
79      @Override
80      protected ExtractorFactory<?> getExtractorFactory() {
81          return new MicrodataExtractorFactory();
82      }
83  
84      /**
85       * Reference test for <a href="http://schema.org">Schema.org</a>.
86       *
87       * @throws ExtractionException
88       *             if an exception is raised during extraction
89       * @throws RepositoryException
90       *             if an error is encountered whilst loading content from a storage connection
91       * @throws RDFHandlerException
92       *             if there is an error in the {@link org.eclipse.rdf4j.rio.RDFHandler} implementation
93       * @throws IOException
94       *             if there is an error loading input data
95       * @throws RDFParseException
96       *             if there is an error parsing an actual RDF stream
97       */
98      @Test
99      public void testSchemaOrgNestedProps()
100             throws RepositoryException, RDFHandlerException, IOException, RDFParseException, ExtractionException {
101         extractAndVerifyAgainstNQuads("microdata-nested.html", "microdata-nested-expected.nquads");
102         logger.debug(dumpModelToNQuads());
103     }
104 
105     @Test
106     public void testUnusedItemprop() {
107         // Test for ANY23-154
108         assertExtract("/microdata/unused-itemprop.html");
109         assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Offer"));
110     }
111 
112     @Test
113     public void testExample2() {
114         // Property URI generation for hcard
115         assertExtract("/microdata/example2.html");
116         assertContains(null, RDF.TYPE, RDFUtils.iri("http://microformats.org/profile/hcard"));
117         assertContains(null, RDFUtils.iri("http://microformats.org/profile/hcard#given-name"), (Value) null);
118         assertContains(null, RDFUtils.iri("http://microformats.org/profile/hcard#n"), (Value) null);
119     }
120 
121     @Test
122     public void testExample5() {
123         // Vocabulary expansion for schema.org
124         assertExtract("/microdata/example5.html");
125         assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Person"));
126         assertContains(null, RDF.TYPE, RDFUtils.iri("http://xmlns.com/foaf/0.1/Person"));
127         assertContains(null, RDFUtils.iri("http://schema.org/additionalType"),
128                 RDFUtils.iri("http://xmlns.com/foaf/0.1/Person"));
129         assertContains(null, RDFUtils.iri("http://schema.org/email"), RDFUtils.iri("mailto:mail@gmail.com"));
130         assertContains(null, RDFUtils.iri("http://xmlns.com/foaf/0.1/mbox"), RDFUtils.iri("mailto:mail@gmail.com"));
131     }
132 
133     private static final List<String> ignoredOnlineTestNames = Arrays.asList("Test 0073", // Vocabulary Expansion test
134                                                                                           // with rdfs:subPropertyOf
135             "Test 0074" // Vocabulary Expansion test with owl:equivalentProperty
136     );
137 
138     private static Any23 createRunner(String extractorName) {
139         ModifiableConfiguration config = DefaultConfiguration.copy();
140         config.setProperty("any23.microdata.strict", DefaultConfiguration.FLAG_PROPERTY_ON);
141         Any23 runner = new Any23(config, extractorName);
142         runner.setHTTPUserAgent("apache-any23-test-user-agent");
143         return runner;
144     }
145 
146     @Test
147     public void runOnlineTests() throws Exception {
148 
149         Any23OnlineTestBase.assumeOnlineAllowed();
150 
151         Any23 ttlRunner = createRunner(TurtleExtractorFactory.NAME);
152         DocumentSource source = new HTTPDocumentSource(ttlRunner.getHTTPClient(),
153                 "https://w3c.github.io/microdata-rdf/tests/manifest.ttl");
154         HashMap<Resource, HashMap<IRI, ArrayDeque<Value>>> map = new HashMap<>(256);
155         ttlRunner.extract(source, new TripleWriterHandler() {
156             public void writeTriple(Resource s, IRI p, Value o, Resource g) {
157                 map.computeIfAbsent(s, k -> new HashMap<>()).computeIfAbsent(p, k -> new ArrayDeque<>()).add(o);
158             }
159 
160             public void writeNamespace(String prefix, String uri) {
161             }
162 
163             public void close() {
164             }
165         });
166 
167         Assert.assertFalse(map.isEmpty());
168 
169         final IRI actionPred = RDFUtils.iri("http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#action");
170         final IRI resultPred = RDFUtils.iri("http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#result");
171         final IRI namePred = RDFUtils.iri("http://www.w3.org/2001/sw/DataAccess/tests/test-manifest#name");
172 
173         AtomicInteger passedTests = new AtomicInteger();
174         AtomicInteger ignoredTests = new AtomicInteger();
175         Map<String, String> failedTests = Collections.synchronizedMap(new TreeMap<>());
176 
177         map.values().parallelStream().forEach(item -> {
178             ArrayDeque<Value> types = item.get(RDF.TYPE);
179             if (types == null) {
180                 return;
181             }
182             boolean positive;
183             label: {
184                 for (Value type : types) {
185                     if (type.stringValue().startsWith("http://www.w3.org/ns/rdftest#TestMicrodataNegative")) {
186                         positive = false;
187                         break label;
188                     } else if (type.stringValue().startsWith("http://www.w3.org/ns/rdftest#TestMicrodata")) {
189                         positive = true;
190                         break label;
191                     }
192                 }
193                 return;
194             }
195             IRI action = (IRI) item.get(actionPred).pop();
196             IRI result = (IRI) (item.containsKey(resultPred) ? item.get(resultPred).pop() : null);
197             String name = ((Literal) item.get(namePred).pop()).getLabel();
198             if (ignoredOnlineTestNames.contains(name)) {
199                 ignoredTests.incrementAndGet();
200                 return;
201             }
202             try {
203                 name += ": " + ((Literal) item.get(RDFS.COMMENT).pop()).getLabel();
204                 TreeModel actual = new TreeModel();
205                 createRunner(MicrodataExtractorFactory.NAME).extract(action.stringValue(), new TripleWriterHandler() {
206                     public void writeTriple(Resource s, IRI p, Value o, Resource g) {
207                         if (MicrodataExtractor.MICRODATA_ITEM.equals(p))
208                             return;
209                         actual.add(s, p, o);
210                     }
211 
212                     public void writeNamespace(String prefix, String uri) {
213                     }
214 
215                     public void close() {
216                     }
217                 });
218 
219                 TreeModel expected = new TreeModel();
220                 if (result != null) {
221                     createRunner(TurtleExtractorFactory.NAME).extract(result.stringValue(), new TripleWriterHandler() {
222                         public void writeTriple(Resource s, IRI p, Value o, Resource g) {
223                             // TODO: remove this if-block after https://github.com/w3c/microdata-rdf/issues/30 has been
224                             // resolved
225                             if (o instanceof IRI
226                                     && o.stringValue().equals("http://w3c.github.io/author/jd_salinger.html")) {
227                                 o = RDFUtils.iri("https://w3c.github.io/author/jd_salinger.html");
228                             }
229 
230                             expected.add(s, p, o);
231                         }
232 
233                         public void writeNamespace(String prefix, String uri) {
234                         }
235 
236                         public void close() {
237                         }
238                     });
239                 }
240 
241                 boolean testPassed = positive == Models.isomorphic(expected, actual);
242                 if (testPassed) {
243                     passedTests.incrementAndGet();
244                 } else {
245                     StringBuilder error = new StringBuilder("\n" + name + "\n");
246                     error.append(action).append(positive ? " ==> " : " =/=> ").append(result).append("\n");
247 
248                     HashMap<Value, String> m = new HashMap<>();
249                     AtomicInteger i = new AtomicInteger();
250                     int match = 0;
251                     for (Statement st : expected) {
252                         Resource s = st.getSubject();
253                         Value o = st.getObject();
254 
255                         if (actual.stream().noneMatch(t -> st.getPredicate().equals(t.getPredicate())
256                                 && (s instanceof BNode ? t.getSubject() instanceof BNode : s.equals(t.getSubject()))
257                                 && (o instanceof BNode ? t.getObject() instanceof BNode : o.equals(t.getObject())))) {
258                             if (positive) {
259                                 Object sstr = s instanceof BNode ? m.computeIfAbsent(s, k -> "_:" + i.getAndIncrement())
260                                         : s;
261                                 Object ostr = o instanceof BNode ? m.computeIfAbsent(o, k -> "_:" + i.getAndIncrement())
262                                         : o;
263                                 error.append("EXPECT: ").append(sstr).append(" ").append(st.getPredicate()).append(" ")
264                                         .append(ostr).append("\n");
265                             }
266                         } else {
267                             match++;
268                         }
269                     }
270                     error.append("...").append(match).append(" statements in common...\n");
271 
272                     for (Statement st : actual) {
273                         Resource s = st.getSubject();
274                         Value o = st.getObject();
275 
276                         if (expected.stream().noneMatch(t -> st.getPredicate().equals(t.getPredicate())
277                                 && (s instanceof BNode ? t.getSubject() instanceof BNode : s.equals(t.getSubject()))
278                                 && (o instanceof BNode ? t.getObject() instanceof BNode : o.equals(t.getObject())))) {
279                             if (positive) {
280                                 Object sstr = s instanceof BNode ? m.computeIfAbsent(s, k -> "_:" + i.getAndIncrement())
281                                         : s;
282                                 Object ostr = o instanceof BNode ? m.computeIfAbsent(o, k -> "_:" + i.getAndIncrement())
283                                         : o;
284                                 error.append("ACTUAL: ").append(sstr).append(" ").append(st.getPredicate()).append(" ")
285                                         .append(ostr).append("\n");
286                             }
287                         }
288                     }
289 
290                     failedTests.put(name, error.toString());
291                 }
292             } catch (Exception e) {
293                 failedTests.put(name, "\n" + e.toString() + "\n");
294             }
295         });
296 
297         if (logger.isDebugEnabled()) {
298             logger.debug("passed=" + passedTests.get() + "; ignored=" + ignoredTests.get());
299         }
300 
301         if (!failedTests.isEmpty()) {
302             Assert.fail(failedTests.size() + " failures out of " + (failedTests.size() + passedTests.get())
303                     + " total tests\n" + String.join("\n", failedTests.keySet()) + "\n\n"
304                     + String.join("\n", failedTests.values()));
305         }
306     }
307 
308     @Test
309     public void testMicrodataBasic() {
310         assertExtract("/microdata/microdata-basic.html");
311         assertModelNotEmpty();
312         assertStatementsSize(null, null, null, 40);
313         assertStatementsSize(RDFUtils.iri("urn:isbn:0-330-34032-8"), null, null, 4);
314     }
315 
316     @Test
317     public void testMicrodataMissingScheme() {
318         assertExtract("/microdata/microdata-missing-scheme.html");
319         assertModelNotEmpty();
320         assertContains(null, RDF.TYPE, RDFUtils.iri("http://schema.org/Answer"));
321     }
322 
323     /**
324      * Reference test as provided by
325      * <a href="http://googlewebmastercentral.blogspot.com/2010/03/microdata-support-for-rich-snippets.html">Google Rich
326      * Snippet for Microdata.</a>
327      *
328      * @throws RepositoryException
329      *             if an error is encountered whilst loading content from a storage connection
330      * @throws RDFHandlerException
331      *             if there is an error in the {@link org.eclipse.rdf4j.rio.RDFHandler} implementation
332      * @throws IOException
333      *             if there is an error loading input data
334      * @throws RDFParseException
335      *             if there is an error parsing an actual RDF stream
336      */
337     @Test
338     public void testMicrodataGoogleRichSnippet()
339             throws RDFHandlerException, RepositoryException, IOException, RDFParseException {
340         extractAndVerifyAgainstNQuads("microdata-richsnippet.html", "microdata-richsnippet-expected.nquads");
341         logger.debug(dumpHumanReadableTriples());
342     }
343 
344     /**
345      * First reference test for <a href="http://www.w3.org/TR/microdata/">Microdata Extraction algorithm</a>.
346      *
347      * @throws RepositoryException
348      *             if an error is encountered whilst loading content from a storage connection
349      * @throws RDFHandlerException
350      *             if there is an error in the {@link org.eclipse.rdf4j.rio.RDFHandler} implementation
351      * @throws IOException
352      *             if there is an error loading input data
353      * @throws RDFParseException
354      *             if there is an error parsing an actual RDF stream
355      */
356     @Test
357     public void testExample5221() throws RDFHandlerException, RepositoryException, IOException, RDFParseException {
358         extractAndVerifyAgainstNQuads("5.2.1-non-normative-example-1.html",
359                 "5.2.1-non-normative-example-1-expected.nquads");
360         logger.debug(dumpHumanReadableTriples());
361     }
362 
363     /**
364      * Second reference test for <a href="http://www.w3.org/TR/microdata/">Microdata Extraction algorithm</a>.
365      *
366      * @throws RepositoryException
367      *             if an error is encountered whilst loading content from a storage connection
368      * @throws RDFHandlerException
369      *             if there is an error in the {@link org.eclipse.rdf4j.rio.RDFHandler} implementation
370      * @throws IOException
371      *             if there is an error loading input data
372      * @throws RDFParseException
373      *             if there is an error parsing an actual RDF stream
374      */
375     @Test
376     public void testExample5222() throws RDFHandlerException, RepositoryException, IOException, RDFParseException {
377         extractAndVerifyAgainstNQuads("5.2.1-non-normative-example-2.html",
378                 "5.2.1-non-normative-example-2-expected.nquads");
379         logger.debug(dumpHumanReadableTriples());
380     }
381 
382     /**
383      * First reference test for <a href="http://schema.org/">http://schema.org/</a>.
384      *
385      * @throws RepositoryException
386      *             if an error is encountered whilst loading content from a storage connection
387      * @throws RDFHandlerException
388      *             if there is an error in the {@link org.eclipse.rdf4j.rio.RDFHandler} implementation
389      * @throws IOException
390      *             if there is an error loading input data
391      * @throws RDFParseException
392      *             if there is an error parsing an actual RDF stream
393      */
394     @Test
395     public void testExampleSchemaOrg1()
396             throws RDFHandlerException, RepositoryException, IOException, RDFParseException {
397         extractAndVerifyAgainstNQuads("schemaorg-example-1.html", "schemaorg-example-1-expected.nquads");
398         logger.debug(dumpHumanReadableTriples());
399     }
400 
401     /**
402      * Second reference test for <a href="http://schema.org/">http://schema.org/</a>.
403      *
404      * @throws RepositoryException
405      *             if an error is encountered whilst loading content from a storage connection
406      * @throws RDFHandlerException
407      *             if there is an error in the {@link org.eclipse.rdf4j.rio.RDFHandler} implementation
408      * @throws IOException
409      *             if there is an error loading input data
410      * @throws RDFParseException
411      *             if there is an error parsing an actual RDF stream
412      */
413     @Test
414     public void testExampleSchemaOrg2()
415             throws RDFHandlerException, RepositoryException, IOException, RDFParseException {
416         extractAndVerifyAgainstNQuads("schemaorg-example-2.html", "schemaorg-example-2-expected.nquads");
417         logger.debug(dumpHumanReadableTriples());
418     }
419 
420     @Test
421     public void testMicrodataNestedUrlResolving() throws IOException {
422         IRI oldBaseIRI = baseIRI;
423         try {
424             baseIRI = RDFUtils.iri("https://ruben.verborgh.org/tmp/schemaorg-test.html");
425             extractAndVerifyAgainstNQuads("microdata-nested-url-resolving.html",
426                     "microdata-nested-url-resolving-expected.nquads");
427         } finally {
428             baseIRI = oldBaseIRI;
429         }
430     }
431 
432     @Test
433     public void testTel() {
434         assertExtract("/microdata/tel-test.html");
435         assertModelNotEmpty();
436         assertContains(RDFUtils.iri("http://schema.org/telephone"), RDFUtils.iri("tel:(909)%20484-2020"));
437     }
438 
439     @Test
440     public void testBadTypes() throws IOException {
441         extractAndVerifyAgainstNQuads("microdata-bad-types.html", "microdata-bad-types-expected.nquads");
442     }
443 
444     @Test
445     public void testBadPropertyNames() throws IOException {
446         extractAndVerifyAgainstNQuads("microdata-bad-properties.html", "microdata-bad-properties-expected.nquads",
447                 false);
448         assertIssue(IssueReport.IssueLevel.ERROR,
449                 ".*invalid property name ''.*\"path\" : \"/HTML\\[1\\]/BODY\\[1\\]/DIV\\[1\\]/DIV\\[2\\]/DIV\\[1\\]\".*");
450     }
451 
452     private void extractAndVerifyAgainstNQuads(String actual, String expected)
453             throws RepositoryException, RDFHandlerException, IOException, RDFParseException {
454         extractAndVerifyAgainstNQuads(actual, expected, true);
455     }
456 
457     private void extractAndVerifyAgainstNQuads(String actual, String expected, boolean assertNoIssues)
458             throws RepositoryException, RDFHandlerException, IOException, RDFParseException {
459         assertExtract("/microdata/" + actual, assertNoIssues);
460         assertModelNotEmpty();
461         logger.debug(dumpModelToNQuads());
462         List<Statement> expectedStatements = loadResultStatement("/microdata/" + expected);
463         int actualStmtSize = getStatementsSize(null, null, null);
464         Assert.assertEquals(expectedStatements.size(), actualStmtSize);
465         for (Statement statement : expectedStatements) {
466             assertContains(statement.getSubject() instanceof BNode ? null : statement.getSubject(),
467                     statement.getPredicate(), statement.getObject() instanceof BNode ? null : statement.getObject());
468         }
469         Model expectedModel = new TreeModel();
470         for (Statement s : expectedStatements) {
471             expectedModel.add(s.getSubject(), s.getPredicate(), s.getObject());
472         }
473 
474         Model actualModel = new TreeModel();
475         conn.export(new RDFHandler() {
476             @Override
477             public void startRDF() throws RDFHandlerException {
478             }
479 
480             @Override
481             public void endRDF() throws RDFHandlerException {
482             }
483 
484             @Override
485             public void handleNamespace(String s, String s1) throws RDFHandlerException {
486             }
487 
488             @Override
489             public void handleStatement(Statement statement) throws RDFHandlerException {
490                 actualModel.add(statement.getSubject(), statement.getPredicate(), statement.getObject());
491             }
492 
493             @Override
494             public void handleComment(String s) throws RDFHandlerException {
495             }
496         });
497 
498         Assert.assertTrue("Models are not isomorphic", Models.isomorphic(expectedModel, actualModel));
499     }
500 
501     private List<Statement> loadResultStatement(String resultFilePath)
502             throws RDFHandlerException, IOException, RDFParseException {
503         RDFParser nQuadsParser = Rio.createParser(RDFFormat.NQUADS);
504         TestRDFHandler rdfHandler = new TestRDFHandler();
505         nQuadsParser.setRDFHandler(rdfHandler);
506         File file = copyResourceToTempFile(resultFilePath);
507         nQuadsParser.parse(new FileReader(file, StandardCharsets.UTF_8), baseIRI.toString());
508         return rdfHandler.getStatements();
509     }
510 
511     public static class TestRDFHandler implements RDFHandler {
512 
513         private final List<Statement> statements = new ArrayList<Statement>();
514 
515         protected List<Statement> getStatements() {
516             return statements;
517         }
518 
519         public void startRDF() throws RDFHandlerException {
520         }
521 
522         public void endRDF() throws RDFHandlerException {
523         }
524 
525         public void handleNamespace(String s, String s1) throws RDFHandlerException {
526             throw new UnsupportedOperationException();
527         }
528 
529         public void handleStatement(Statement statement) throws RDFHandlerException {
530             statements.add(statement);
531         }
532 
533         public void handleComment(String s) throws RDFHandlerException {
534             throw new UnsupportedOperationException();
535         }
536     }
537 
538 }