View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.jsoup.Jsoup;
21  import org.jsoup.nodes.Document;
22  import org.jsoup.parser.Parser;
23  
24  import java.io.ByteArrayInputStream;
25  import java.io.IOException;
26  import java.io.InputStream;
27  import java.io.SequenceInputStream;
28  import java.nio.charset.StandardCharsets;
29  import java.util.Arrays;
30  
31  /**
32   * @author Hans Brende
33   */
34  public class JsoupUtils {
35  
36      public static Document parse(InputStream input, String documentIRI, String encoding) throws IOException {
37          // Jsoup doesn't allow null document URIs
38          if (documentIRI == null) {
39              documentIRI = "";
40          }
41  
42          // workaround for Jsoup issue #1009
43          if (encoding == null) {
44  
45              int c;
46              do {
47                  c = input.read();
48              } while (c != -1 && Character.isWhitespace(c));
49  
50              if (c != -1) {
51                  int capacity = 256;
52                  byte[] bytes = new byte[capacity];
53                  int length = 0;
54                  bytes[length++] = (byte) c;
55  
56                  if (c == '<') {
57                      c = input.read();
58                      if (c != -1) {
59                          bytes[length++] = (byte) c;
60                          if (c == '?') {
61                              c = input.read();
62  
63                              while (c != -1) {
64                                  if (length == capacity) {
65                                      capacity *= 2;
66                                      bytes = Arrays.copyOf(bytes, capacity);
67                                  }
68                                  bytes[length++] = (byte) c;
69  
70                                  if (c == '>') {
71                                      if (length >= 20 && bytes[length - 2] == '?') {
72                                          String decl = "<" + new String(bytes, 2, length - 4, StandardCharsets.UTF_8)
73                                                  + ">";
74                                          org.jsoup.nodes.Document doc = org.jsoup.Jsoup.parse(decl, documentIRI,
75                                                  Parser.xmlParser());
76                                          for (org.jsoup.nodes.Element el : doc.children()) {
77                                              if ("xml".equalsIgnoreCase(el.tagName())) {
78                                                  String enc = el.attr("encoding");
79                                                  if (enc != null && !enc.isEmpty()) {
80                                                      encoding = enc;
81                                                      break;
82                                                  }
83                                              }
84                                          }
85                                      }
86                                      break;
87                                  }
88  
89                                  c = input.read();
90                              }
91                          }
92                      }
93  
94                  }
95  
96                  input = new SequenceInputStream(new ByteArrayInputStream(bytes, 0, length), input);
97              }
98  
99          }
100 
101         // Use Parser.htmlParser() to parse javascript correctly
102         return Jsoup.parse(input, encoding, documentIRI, Parser.htmlParser());
103     }
104 
105 }