/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.any23.extractor.html;
1920import org.apache.any23.extractor.ExtractionException;
21import org.junit.After;
22import org.junit.Assert;
23import org.junit.Test;
24import org.eclipse.rdf4j.repository.RepositoryException;
25import org.w3c.dom.Document;
26import org.w3c.dom.NamedNodeMap;
27import org.w3c.dom.Node;
28import org.w3c.dom.NodeList;
2930import java.io.BufferedInputStream;
31import java.io.ByteArrayInputStream;
32import java.io.ByteArrayOutputStream;
33import java.io.IOException;
34import java.io.InputStream;
35import java.io.PrintStream;
36import java.nio.charset.StandardCharsets;
3738/**39 * Reference Test class for {@link TagSoupParser} parser.40 *41 * @author Davide Palmisano (dpalmisano@gmail.com)42 * @author Michele Mostarda (michele.mostarda@gmail.com)43 *44 */45publicclassTagSoupParserTest {
4647privatestaticfinal String page = "http://semanticweb.org/wiki/Knud_M%C3%B6ller";
4849private TagSoupParser tagSoupParser;
5051 @After
52publicvoid tearDown() throws RepositoryException {
53this.tagSoupParser = null;
5455 }
5657 @Test
58publicvoid testParseSimpleHTML() throws IOException {
59 String html = "<html><head><title>Test</title></head><body><h1>Hello!</h1></body></html>";
60 InputStream input = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
61 Node document = new TagSoupParser(input, "http://example.com/").getDOM();
62 Assert.assertEquals("Test", new HTMLDocument(document).find("//TITLE"));
63 Assert.assertEquals("Hello!", new HTMLDocument(document).find("//H1"));
64 }
6566 @Test
67publicvoid testExplicitEncodingBehavior() throws IOException, ExtractionException, RepositoryException {
68this.tagSoupParser = new TagSoupParser(
69new BufferedInputStream(this.getClass().getResourceAsStream("/html/encoding-test.html")), page,
70"UTF-8");
7172 Assert.assertEquals(this.tagSoupParser.getDOM().getElementsByTagName("title").item(0).getTextContent(),
73"Knud M\u00F6ller - semanticweb.org");
74 }
7576/**77 * This tests the Neko HTML parser without forcing it on using a specific encoding charset. We expect that this test78 * may fail if something changes in the Neko library, as an auto-detection of the encoding.79 *80 * @throws IOException81 * if there is an error interpreting the input data82 * @throws ExtractionException83 * if there is an exception during extraction84 * @throws org.eclipse.rdf4j.repository.RepositoryException85 * if an error is encountered whilst loading content from a storage connection86 */87 @Test
88publicvoid testImplicitEncodingBehavior() throws IOException, ExtractionException, RepositoryException {
89this.tagSoupParser = new TagSoupParser(
90new BufferedInputStream(this.getClass().getResourceAsStream("/html/encoding-test.html")), page);
91 Assert.assertNotSame(this.tagSoupParser.getDOM().getElementsByTagName("title").item(0).getTextContent(),
92"Knud M\u00F6ller - semanticweb.org");
93 }
9495/**96 * Test related to the issue 78 and disabled until the underlying <i>NekoHTML</i> bug has been fixed.97 * 98 * @throws IOException99 * if there is an error interpreting the input data100 */101 @Test
102publicvoid testEmptySpanElements() throws IOException {
103final String page = "http://example.com/test-page";
104 InputStream brokenEmptySpanHtml = new BufferedInputStream(
105this.getClass().getResourceAsStream("/html/empty-span-broken.html"));
106 InputStream worksEmptySpanHtml = new BufferedInputStream(
107this.getClass().getResourceAsStream("/html/empty-span-works.html"));
108this.tagSoupParser = new TagSoupParser(brokenEmptySpanHtml, page);
109 Document brokenElementDom = this.tagSoupParser.getDOM();
110this.tagSoupParser = null; // useless but force GC111112this.tagSoupParser = new TagSoupParser(worksEmptySpanHtml, page);
113 Document worksElementDom = this.tagSoupParser.getDOM();
114115 NodeList brokenNodeList = brokenElementDom.getElementsByTagName("span");
116 Assert.assertEquals(3, brokenNodeList.getLength());
117118 NodeList worksNodeList = worksElementDom.getElementsByTagName("span");
119 Assert.assertEquals(3, worksNodeList.getLength());
120121final ByteArrayOutputStream out1 = new ByteArrayOutputStream();
122 PrintStream psOut1 = new PrintStream(out1, true, StandardCharsets.UTF_8);
123for (int i = 0; i < worksNodeList.getLength(); i++) {
124 printNode(worksNodeList.item(i), psOut1);
125 }
126 psOut1.close();
127128final ByteArrayOutputStream out2 = new ByteArrayOutputStream();
129 PrintStream psOut2 = new PrintStream(out2, true, StandardCharsets.UTF_8);
130for (int i = 0; i < brokenNodeList.getLength(); i++) {
131 printNode(brokenNodeList.item(i), psOut2);
132 }
133 psOut2.close();
134135 Assert.assertEquals(out1.toString(StandardCharsets.UTF_8), out2.toString(StandardCharsets.UTF_8));
136 }
137138privatevoid printNode(Node node, PrintStream printStream) {
139 printStream.println("node name:" + node.getNodeName());
140 printStream.println("node value:" + node.getNodeValue());
141 printStream.println("node has child:" + node.hasChildNodes());
142 printStream.println("node # child:" + node.getChildNodes().getLength());
143144 printStream.println("node child:");
145 NodeList childNodes = node.getChildNodes();
146for (int j = 0; j < childNodes.getLength(); j++) {
147 Node brokenChild = childNodes.item(j);
148 printStream.println(" node name:" + brokenChild.getNodeName());
149 printStream.println(" node type:" + brokenChild.getNodeType());
150 printStream.println(" node value:" + trimValue(brokenChild.getNodeValue()));
151 }
152153 printStream.println("node attributes:");
154 NamedNodeMap namedNodeMap = node.getAttributes();
155for (int j = 0; j < namedNodeMap.getLength(); j++) {
156 Node attribute = namedNodeMap.item(j);
157 printStream.println(" attribute name:" + attribute.getNodeName());
158 printStream.println(" attribute value:" + trimValue(attribute.getNodeValue()));
159 }
160 printStream.println();
161 }
162163private String trimValue(String in) {
164return in == null ? "" : in.trim();
165 }
166167 }