View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.jsoup.nodes.Attribute;
21  import org.jsoup.select.NodeTraversor;
22  import org.jsoup.select.NodeVisitor;
23  import org.w3c.dom.Comment;
24  import org.w3c.dom.Document;
25  import org.w3c.dom.Text;
26  
27  import java.io.IOException;
28  import java.io.InputStream;
29  
30  /**
31   * The parsing configuration for a {@link TagSoupParser}
32   * 
33   * @author Hans Brende
34   */
35  abstract class TagSoupParsingConfiguration {
36  
37      String name() {
38          return getClass().getSimpleName();
39      }
40  
41      abstract Document parse(InputStream input, String documentIRI, String encoding) throws IOException;
42  
43      static TagSoupParsingConfiguration getDefault() {
44          return JsoupConfig.instance;
45      }
46  
47      private static class JsoupConfig extends TagSoupParsingConfiguration {
48  
49          private static final JsoupConfig instance = new JsoupConfig();
50  
51          @Override
52          Document parse(InputStream input, String documentIRI, String encoding) throws IOException {
53  
54              org.jsoup.nodes.Document document = JsoupUtils.parse(input, documentIRI, encoding);
55  
56              return convert(document);
57          }
58  
59          private static Document convert(org.jsoup.nodes.Document document) {
60              Document w3cDoc = new org.apache.html.dom.HTMLDocumentImpl();
61  
62              org.jsoup.nodes.Element rootEl = document.children().first();
63              if (rootEl != null) {
64                  NodeTraversor.traverse(new DocumentConverter(w3cDoc), rootEl);
65              }
66  
67              return w3cDoc;
68          }
69  
70          private static class DocumentConverter implements NodeVisitor {
71  
72              private final Document doc;
73              private org.w3c.dom.Element dest;
74  
75              DocumentConverter(Document doc) {
76                  this.doc = doc;
77              }
78  
79              @Override
80              public void head(org.jsoup.nodes.Node source, int depth) {
81                  if (source instanceof org.jsoup.nodes.Element) {
82                      org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source;
83  
84                      org.w3c.dom.Element el = doc.createElement(sourceEl.tagName());
85                      copyAttributes(sourceEl, el);
86                      if (dest == null) {
87                          doc.appendChild(el);
88                      } else {
89                          dest.appendChild(el);
90                      }
91                      dest = el;
92                  } else if (source instanceof org.jsoup.nodes.TextNode) {
93                      org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source;
94                      Text text = doc.createTextNode(sourceText.getWholeText());
95                      dest.appendChild(text);
96                  } else if (source instanceof org.jsoup.nodes.Comment) {
97                      org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source;
98                      Comment comment = doc.createComment(sourceComment.getData());
99                      dest.appendChild(comment);
100                 } else if (source instanceof org.jsoup.nodes.DataNode) {
101                     org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source;
102                     Text node = doc.createTextNode(stripCDATA(sourceData.getWholeData()));
103                     dest.appendChild(node);
104                 }
105             }
106 
107             @Override
108             public void tail(org.jsoup.nodes.Node source, int depth) {
109                 if (source instanceof org.jsoup.nodes.Element && dest.getParentNode() instanceof org.w3c.dom.Element) {
110                     dest = (org.w3c.dom.Element) dest.getParentNode();
111                 }
112             }
113 
114             private void copyAttributes(org.jsoup.nodes.Node source, org.w3c.dom.Element el) {
115                 for (Attribute attribute : source.attributes()) {
116                     // valid xml attribute names are: ^[a-zA-Z_:][-a-zA-Z0-9_:.]
117                     String key = attribute.getKey().replaceAll("[^-a-zA-Z0-9_:.]", "");
118                     if (key.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*"))
119                         el.setAttribute(key, attribute.getValue());
120                 }
121             }
122         }
123 
124         private static String stripCDATA(String string) {
125             return reduceToContent(string, "<![CDATA[", "]]>");
126         }
127 
128         private static String reduceToContent(String string, String startMarker, String endMarker) {
129             int i = 0;
130             int startContent = -1;
131             int l1 = startMarker.length();
132 
133             int l2;
134             char c;
135             for (l2 = endMarker.length(); i < string.length() - l1 - l2; ++i) {
136                 c = string.charAt(i);
137                 if (!Character.isWhitespace(c)) {
138                     if (c == startMarker.charAt(0) && startMarker.equals(string.substring(i, l1 + i))) {
139                         startContent = i + l1;
140                         break;
141                     }
142 
143                     return string;
144                 }
145             }
146 
147             if (startContent != -1) {
148                 for (i = string.length() - 1; i > startContent + l2; --i) {
149                     c = string.charAt(i);
150                     if (!Character.isWhitespace(c)) {
151                         if (c == endMarker.charAt(l2 - 1) && endMarker.equals(string.substring(i - l2 + 1, i + 1))) {
152 
153                             return string.substring(startContent, i - 2);
154                         }
155 
156                         return string;
157                     }
158                 }
159 
160             }
161             return string;
162         }
163 
164     }
165 
166 }