This project has retired. For details please refer to its Attic page.
TitleExtractorTest xref
View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractorFactory;
21  import org.apache.any23.rdf.RDFUtils;
22  import org.apache.any23.vocab.DCTerms;
23  import org.apache.any23.vocab.SINDICE;
24  import org.junit.Test;
25  import org.eclipse.rdf4j.model.Literal;
26  import org.eclipse.rdf4j.repository.RepositoryException;
27  
28  /**
29   * Reference Test class for the {@link TitleExtractor} extractor.
30   * Each test calls {@code assertExtract} on an HTML fixture path and then asserts on the extracted model.
31   */
32  public class TitleExtractorTest extends AbstractExtractorTestCase {
33      // Vocabulary singletons; only vDCTERMS.title is used below, vSINDICE is not referenced in this class.
34      private static final DCTerms vDCTERMS = DCTerms.getInstance();
35      private static final SINDICE vSINDICE = SINDICE.getInstance();
36      /** Expected title literal asserted by the simple-me, strip-spaces and mixed-case tests. */
37      private Literal helloLiteral = RDFUtils.literal("Hello World!");
38      /** Overrides the base-class hook to return a new {@link TitleExtractorFactory} on every call. */
39      @Override
40      protected ExtractorFactory<?> getExtractorFactory() {
41          return new TitleExtractorFactory();
42      }
43      /** Asserts that (baseIRI, vDCTERMS.title, "Hello World!") is contained after extracting simple-me.html. */
44      @Test
45      public void testExtractPageTitle() throws RepositoryException {
46          assertExtract("/microformats/xfn/simple-me.html");
47          assertContains(baseIRI, vDCTERMS.title, helloLiteral);
48      }
49      /** Asserts the same "Hello World!" title for strip-spaces.html; presumably its title is whitespace-padded (fixture not shown here). */
50      @Test
51      public void testStripSpacesFromTitle() throws RepositoryException {
52          assertExtract("/microformats/xfn/strip-spaces.html");
53          assertContains(baseIRI, vDCTERMS.title, helloLiteral);
54      }
55      /** Extracts tagsoup.html and calls assertModelEmpty(); presumably this fixture has no title element (fixture not shown here). */
56      @Test
57      public void testNoPageTitle() throws RepositoryException {
58          assertExtract("/microformats/xfn/tagsoup.html");
59          assertModelEmpty();
60      }
61      /** Asserts the same "Hello World!" title for mixed-case.html; presumably its title tag uses mixed letter case (fixture not shown here). */
62      @Test
63      public void testMixedCaseTitleTag() throws RepositoryException {
64          assertExtract("/microformats/xfn/mixed-case.html");
65          assertContains(baseIRI, vDCTERMS.title, helloLiteral);
66      }
67  
68      /**
69       * This test verifies that, when a default language is present, it is adopted by the title literal.
70       * It also asserts that the same text built with a {@code null} language is not contained.
71       * @throws org.eclipse.rdf4j.repository.RepositoryException
72       *             if an error is encountered whilst loading content from a storage connection
73       */
74      @Test
75      public void testTitleWithDefaultLanguage() throws RepositoryException {
76          assertExtract("/html/default-language.html");
77          assertContains(baseIRI, vDCTERMS.title, RDFUtils.literal("Welcome to mydomain.net", "en"));
78          assertNotContains(baseIRI, vDCTERMS.title, RDFUtils.literal("Welcome to mydomain.net", (String) null));
79      }
80  
81  }