This project has retired. For details please refer to its Attic page.
TagSoupParserTest xref
View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionException;
21  import org.junit.After;
22  import org.junit.Assert;
23  import org.junit.Test;
24  import org.eclipse.rdf4j.repository.RepositoryException;
25  import org.w3c.dom.Document;
26  import org.w3c.dom.NamedNodeMap;
27  import org.w3c.dom.Node;
28  import org.w3c.dom.NodeList;
29  
30  import java.io.BufferedInputStream;
31  import java.io.ByteArrayInputStream;
32  import java.io.ByteArrayOutputStream;
33  import java.io.IOException;
34  import java.io.InputStream;
35  import java.io.PrintStream;
36  import java.nio.charset.StandardCharsets;
37  
38  /**
39   * Reference Test class for {@link TagSoupParser} parser.
40   *
41   * @author Davide Palmisano (dpalmisano@gmail.com)
42   * @author Michele Mostarda (michele.mostarda@gmail.com)
43   *
44   */
45  public class TagSoupParserTest {
46  
47      private static final String page = "http://semanticweb.org/wiki/Knud_M%C3%B6ller";
48  
49      private TagSoupParser tagSoupParser;
50  
51      @After
52      public void tearDown() throws RepositoryException {
53          this.tagSoupParser = null;
54  
55      }
56  
57      @Test
58      public void testParseSimpleHTML() throws IOException {
59          String html = "<html><head><title>Test</title></head><body><h1>Hello!</h1></body></html>";
60          InputStream input = new ByteArrayInputStream(html.getBytes(StandardCharsets.UTF_8));
61          Node document = new TagSoupParser(input, "http://example.com/").getDOM();
62          Assert.assertEquals("Test", new HTMLDocument(document).find("//TITLE"));
63          Assert.assertEquals("Hello!", new HTMLDocument(document).find("//H1"));
64      }
65  
66      @Test
67      public void testExplicitEncodingBehavior() throws IOException, ExtractionException, RepositoryException {
68          this.tagSoupParser = new TagSoupParser(
69                  new BufferedInputStream(this.getClass().getResourceAsStream("/html/encoding-test.html")), page,
70                  "UTF-8");
71  
72          Assert.assertEquals(this.tagSoupParser.getDOM().getElementsByTagName("title").item(0).getTextContent(),
73                  "Knud M\u00F6ller - semanticweb.org");
74      }
75  
76      /**
77       * This tests the Neko HTML parser without forcing it on using a specific encoding charset. We expect that this test
78       * may fail if something changes in the Neko library, as an auto-detection of the encoding.
79       *
80       * @throws IOException
81       *             if there is an error interpreting the input data
82       * @throws ExtractionException
83       *             if there is an exception during extraction
84       * @throws org.eclipse.rdf4j.repository.RepositoryException
85       *             if an error is encountered whilst loading content from a storage connection
86       */
87      @Test
88      public void testImplicitEncodingBehavior() throws IOException, ExtractionException, RepositoryException {
89          this.tagSoupParser = new TagSoupParser(
90                  new BufferedInputStream(this.getClass().getResourceAsStream("/html/encoding-test.html")), page);
91          Assert.assertNotSame(this.tagSoupParser.getDOM().getElementsByTagName("title").item(0).getTextContent(),
92                  "Knud M\u00F6ller - semanticweb.org");
93      }
94  
95      /**
96       * Test related to the issue 78 and disabled until the underlying <i>NekoHTML</i> bug has been fixed.
97       * 
98       * @throws IOException
99       *             if there is an error interpreting the input data
100      */
101     @Test
102     public void testEmptySpanElements() throws IOException {
103         final String page = "http://example.com/test-page";
104         InputStream brokenEmptySpanHtml = new BufferedInputStream(
105                 this.getClass().getResourceAsStream("/html/empty-span-broken.html"));
106         InputStream worksEmptySpanHtml = new BufferedInputStream(
107                 this.getClass().getResourceAsStream("/html/empty-span-works.html"));
108         this.tagSoupParser = new TagSoupParser(brokenEmptySpanHtml, page);
109         Document brokenElementDom = this.tagSoupParser.getDOM();
110         this.tagSoupParser = null; // useless but force GC
111 
112         this.tagSoupParser = new TagSoupParser(worksEmptySpanHtml, page);
113         Document worksElementDom = this.tagSoupParser.getDOM();
114 
115         NodeList brokenNodeList = brokenElementDom.getElementsByTagName("span");
116         Assert.assertEquals(3, brokenNodeList.getLength());
117 
118         NodeList worksNodeList = worksElementDom.getElementsByTagName("span");
119         Assert.assertEquals(3, worksNodeList.getLength());
120 
121         final ByteArrayOutputStream out1 = new ByteArrayOutputStream();
122         PrintStream psOut1 = new PrintStream(out1, true, StandardCharsets.UTF_8);
123         for (int i = 0; i < worksNodeList.getLength(); i++) {
124             printNode(worksNodeList.item(i), psOut1);
125         }
126         psOut1.close();
127 
128         final ByteArrayOutputStream out2 = new ByteArrayOutputStream();
129         PrintStream psOut2 = new PrintStream(out2, true, StandardCharsets.UTF_8);
130         for (int i = 0; i < brokenNodeList.getLength(); i++) {
131             printNode(brokenNodeList.item(i), psOut2);
132         }
133         psOut2.close();
134 
135         Assert.assertEquals(out1.toString(StandardCharsets.UTF_8), out2.toString(StandardCharsets.UTF_8));
136     }
137 
138     private void printNode(Node node, PrintStream printStream) {
139         printStream.println("node name:" + node.getNodeName());
140         printStream.println("node value:" + node.getNodeValue());
141         printStream.println("node has child:" + node.hasChildNodes());
142         printStream.println("node # child:" + node.getChildNodes().getLength());
143 
144         printStream.println("node child:");
145         NodeList childNodes = node.getChildNodes();
146         for (int j = 0; j < childNodes.getLength(); j++) {
147             Node brokenChild = childNodes.item(j);
148             printStream.println("    node name:" + brokenChild.getNodeName());
149             printStream.println("    node type:" + brokenChild.getNodeType());
150             printStream.println("    node value:" + trimValue(brokenChild.getNodeValue()));
151         }
152 
153         printStream.println("node attributes:");
154         NamedNodeMap namedNodeMap = node.getAttributes();
155         for (int j = 0; j < namedNodeMap.getLength(); j++) {
156             Node attribute = namedNodeMap.item(j);
157             printStream.println("    attribute name:" + attribute.getNodeName());
158             printStream.println("    attribute value:" + trimValue(attribute.getNodeValue()));
159         }
160         printStream.println();
161     }
162 
163     private String trimValue(String in) {
164         return in == null ? "" : in.trim();
165     }
166 
167 }