/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.any23.extractor.html;
1920import java.io.FileNotFoundException;
21import java.io.IOException;
2223import org.apache.any23.AbstractAny23TestBase;
24import org.junit.Assert;
25import org.junit.Test;
2627/**28 * Test class to ensure behaviors of {@link HTMLDocument} parser with encoding corner cases.29 */30publicclassEncodingTestextendsAbstractAny23TestBase {
3132privatefinalstatic String HELLO_WORLD = "Hell\u00F6 W\u00F6rld!";
3334 @Test
35publicvoid testEncodingHTML_ISO_8859_1() throws Exception {
36 HTMLDocument document = parseHTML("/microformats/xfn/encoding-iso-8859-1.html");
37 Assert.assertEquals(HELLO_WORLD, document.find("//TITLE"));
38 }
3940 @Test
41publicvoid testEncodingHTML_UTF_8() throws Exception {
42 HTMLDocument document = parseHTML("/microformats/xfn/encoding-utf-8.html");
43 Assert.assertEquals(HELLO_WORLD, document.find("//TITLE"));
44 }
4546/**47 * Known issue: NekoHTML does not auto-detect the encoding, but relies on the explicitly specified encoding (via XML48 * declaration or HTTP-Equiv meta header). If the meta header comes *after* the title element, then NekoHTML will49 * not use the declared encoding for the title.50 *51 * For this test we expect to not recognize the title.52 * 53 * @throws Exception54 * if there is an error asserting the test data.55 */56 @Test
57publicvoid testEncodingHTML_UTF_8_DeclarationAfterTitle() throws Exception {
58 HTMLDocument document = parseHTML("/microformats/xfn/encoding-utf-8-after-title.html");
59 Assert.assertNotSame(HELLO_WORLD, document.find("//TITLE"));
60 }
6162 @Test
63publicvoid testEncodingXHTML_ISO_8859_1() throws Exception {
64 HTMLDocument document = parseHTML("/microformats/xfn/encoding-iso-8859-1.xhtml");
65 Assert.assertEquals(HELLO_WORLD, document.find("//TITLE"));
66 }
6768 @Test
69publicvoid testEncodingXHTML_UTF_8() throws Exception {
70 HTMLDocument document = parseHTML("/microformats/xfn/encoding-utf-8.xhtml");
71 Assert.assertEquals(HELLO_WORLD, document.find("//TITLE"));
72 }
7374private HTMLDocument parseHTML(String filename) throws FileNotFoundException, IOException {
75returnnewHTMLFixture(copyResourceToTempFile(filename)).getHTMLDocument();
76 }
77 }