This project has retired. For details please refer to its Attic page.
EncodingTest xref
View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import java.io.FileNotFoundException;
21  import java.io.IOException;
22  
23  import org.apache.any23.AbstractAny23TestBase;
24  import org.junit.Assert;
25  import org.junit.Test;
26  
27  /**
28   * Test class to ensure behaviors of {@link HTMLDocument} parser with encoding corner cases.
29   */
30  public class EncodingTest extends AbstractAny23TestBase {
31  
32      private final static String HELLO_WORLD = "Hell\u00F6 W\u00F6rld!";
33  
34      @Test
35      public void testEncodingHTML_ISO_8859_1() throws Exception {
36          HTMLDocument document = parseHTML("/microformats/xfn/encoding-iso-8859-1.html");
37          Assert.assertEquals(HELLO_WORLD, document.find("//TITLE"));
38      }
39  
40      @Test
41      public void testEncodingHTML_UTF_8() throws Exception {
42          HTMLDocument document = parseHTML("/microformats/xfn/encoding-utf-8.html");
43          Assert.assertEquals(HELLO_WORLD, document.find("//TITLE"));
44      }
45  
46      /**
47       * Known issue: NekoHTML does not auto-detect the encoding, but relies on the explicitly specified encoding (via XML
48       * declaration or HTTP-Equiv meta header). If the meta header comes *after* the title element, then NekoHTML will
49       * not use the declared encoding for the title.
50       *
51       * For this test we expect to not recognize the title.
52       * 
53       * @throws Exception
54       *             if there is an error asserting the test data.
55       */
56      @Test
57      public void testEncodingHTML_UTF_8_DeclarationAfterTitle() throws Exception {
58          HTMLDocument document = parseHTML("/microformats/xfn/encoding-utf-8-after-title.html");
59          Assert.assertNotSame(HELLO_WORLD, document.find("//TITLE"));
60      }
61  
62      @Test
63      public void testEncodingXHTML_ISO_8859_1() throws Exception {
64          HTMLDocument document = parseHTML("/microformats/xfn/encoding-iso-8859-1.xhtml");
65          Assert.assertEquals(HELLO_WORLD, document.find("//TITLE"));
66      }
67  
68      @Test
69      public void testEncodingXHTML_UTF_8() throws Exception {
70          HTMLDocument document = parseHTML("/microformats/xfn/encoding-utf-8.xhtml");
71          Assert.assertEquals(HELLO_WORLD, document.find("//TITLE"));
72      }
73  
74      private HTMLDocument parseHTML(String filename) throws FileNotFoundException, IOException {
75          return new HTMLFixture(copyResourceToTempFile(filename)).getHTMLDocument();
76      }
77  }