View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  
19  package org.apache.any23.extractor.html;
20  
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.parser.Parser;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.SequenceInputStream;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
30  
31  /**
32   * @author Hans Brende
33   */
34  public class JsoupUtils {
35  
36      public static Document parse(InputStream input, String documentIRI, String encoding) throws IOException {
37          //Jsoup doesn't allow null document URIs
38          if (documentIRI == null) {
39              documentIRI = "";
40          }
41  
42          //workaround for Jsoup issue #1009
43          if (encoding == null) {
44  
45              int c;
46              do {
47                  c = input.read();
48              } while (c != -1 && Character.isWhitespace(c));
49  
50              if (c != -1) {
51                  int capacity = 256;
52                  byte[] bytes = new byte[capacity];
53                  int length = 0;
54                  bytes[length++] = (byte)c;
55  
56                  if (c == '<') {
57                      c = input.read();
58                      if (c != -1) {
59                          bytes[length++] = (byte)c;
60                          if (c == '?') {
61                              c = input.read();
62  
63                              while (c != -1) {
64                                  if (length == capacity) {
65                                      capacity *= 2;
66                                      bytes = Arrays.copyOf(bytes, capacity);
67                                  }
68                                  bytes[length++] = (byte)c;
69  
70                                  if (c == '>') {
71                                      if (length >= 20 && bytes[length - 2] == '?') {
72                                          String decl = "<" + new String(bytes, 2, length - 4) + ">";
73                                          org.jsoup.nodes.Document doc = org.jsoup.Jsoup.parse(decl, documentIRI, Parser.xmlParser());
74                                          for (org.jsoup.nodes.Element el : doc.children()) {
75                                              if ("xml".equalsIgnoreCase(el.tagName())) {
76                                                  String enc = el.attr("encoding");
77                                                  if (enc != null && !enc.isEmpty()) {
78                                                      encoding = enc;
79                                                      break;
80                                                  }
81                                              }
82                                          }
83                                      }
84                                      break;
85                                  }
86  
87                                  c = input.read();
88                              }
89                          }
90                      }
91  
92                  }
93  
94                  input = new SequenceInputStream(new ByteArrayInputStream(bytes, 0, length), input);
95              }
96  
97          }
98  
99          //Use Parser.htmlParser() to parse javascript correctly
100         return Jsoup.parse(input, encoding, documentIRI, Parser.htmlParser());
101     }
102 
103 }