View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.jsoup.nodes.Attribute;
21  import org.jsoup.select.NodeTraversor;
22  import org.jsoup.select.NodeVisitor;
23  import org.w3c.dom.Comment;
24  import org.w3c.dom.Document;
25  import org.w3c.dom.Text;
26  
27  import java.io.IOException;
28  import java.io.InputStream;
29  
30  
31  /**
32   * The parsing configuration for a {@link TagSoupParser}
33   * @author Hans Brende
34   */
35  abstract class TagSoupParsingConfiguration {
36  
37      String name() {
38          return getClass().getSimpleName();
39      }
40  
41      abstract Document parse(InputStream input, String documentIRI, String encoding) throws IOException;
42  
43  
44      static TagSoupParsingConfiguration getDefault() {
45          return JsoupConfig.instance;
46      }
47  
48  
49      private static class JsoupConfig extends TagSoupParsingConfiguration {
50  
51          private static final JsoupConfig instance = new JsoupConfig();
52  
53  
54          @Override
55          Document parse(InputStream input, String documentIRI, String encoding) throws IOException {
56  
57              org.jsoup.nodes.Document document = JsoupUtils.parse(input, documentIRI, encoding);
58  
59              return convert(document);
60          }
61  
62  
63          private static Document convert(org.jsoup.nodes.Document document) {
64              Document w3cDoc = new org.apache.html.dom.HTMLDocumentImpl();
65  
66              org.jsoup.nodes.Element rootEl = document.children().first();
67              if (rootEl != null) {
68                  NodeTraversor.traverse(new DocumentConverter(w3cDoc), rootEl);
69              }
70  
71              return w3cDoc;
72          }
73  
74          private static class DocumentConverter implements NodeVisitor {
75  
76              private final Document doc;
77              private org.w3c.dom.Element dest;
78  
79              DocumentConverter(Document doc) {
80                  this.doc = doc;
81              }
82  
83              @Override
84              public void head(org.jsoup.nodes.Node source, int depth) {
85                  if (source instanceof org.jsoup.nodes.Element) {
86                      org.jsoup.nodes.Element sourceEl = (org.jsoup.nodes.Element) source;
87  
88                      org.w3c.dom.Element el = doc.createElement(sourceEl.tagName());
89                      copyAttributes(sourceEl, el);
90                      if (dest == null) {
91                          doc.appendChild(el);
92                      } else {
93                          dest.appendChild(el);
94                      }
95                      dest = el;
96                  } else if (source instanceof org.jsoup.nodes.TextNode) {
97                      org.jsoup.nodes.TextNode sourceText = (org.jsoup.nodes.TextNode) source;
98                      Text text = doc.createTextNode(sourceText.getWholeText());
99                      dest.appendChild(text);
100                 } else if (source instanceof org.jsoup.nodes.Comment) {
101                     org.jsoup.nodes.Comment sourceComment = (org.jsoup.nodes.Comment) source;
102                     Comment comment = doc.createComment(sourceComment.getData());
103                     dest.appendChild(comment);
104                 } else if (source instanceof org.jsoup.nodes.DataNode) {
105                     org.jsoup.nodes.DataNode sourceData = (org.jsoup.nodes.DataNode) source;
106                     Text node = doc.createTextNode(stripCDATA(sourceData.getWholeData()));
107                     dest.appendChild(node);
108                 }
109             }
110 
111             @Override
112             public void tail(org.jsoup.nodes.Node source, int depth) {
113                 if (source instanceof org.jsoup.nodes.Element && dest.getParentNode() instanceof org.w3c.dom.Element) {
114                     dest = (org.w3c.dom.Element) dest.getParentNode();
115                 }
116             }
117 
118             private void copyAttributes(org.jsoup.nodes.Node source, org.w3c.dom.Element el) {
119                 for (Attribute attribute : source.attributes()) {
120                     // valid xml attribute names are: ^[a-zA-Z_:][-a-zA-Z0-9_:.]
121                     String key = attribute.getKey().replaceAll("[^-a-zA-Z0-9_:.]", "");
122                     if (key.matches("[a-zA-Z_:][-a-zA-Z0-9_:.]*"))
123                         el.setAttribute(key, attribute.getValue());
124                 }
125             }
126         }
127 
128         private static String stripCDATA(String string) {
129             return reduceToContent(string, "<![CDATA[", "]]>");
130         }
131 
132         private static String reduceToContent(String string, String startMarker, String endMarker) {
133             int i = 0;
134             int startContent = -1;
135             int l1 = startMarker.length();
136 
137             int l2;
138             char c;
139             for(l2 = endMarker.length(); i < string.length() - l1 - l2; ++i) {
140                 c = string.charAt(i);
141                 if (!Character.isWhitespace(c)) {
142                     if (c == startMarker.charAt(0) && startMarker.equals(string.substring(i, l1 + i))) {
143                         startContent = i + l1;
144                         break;
145                     }
146 
147                     return string;
148                 }
149             }
150 
151             if (startContent != -1) {
152                 for(i = string.length() - 1; i > startContent + l2; --i) {
153                     c = string.charAt(i);
154                     if (!Character.isWhitespace(c)) {
155                         if (c == endMarker.charAt(l2 - 1) && endMarker.equals(string.substring(i - l2 + 1, i + 1))) {
156 
157                             return string.substring(startContent, i - 2);
158                         }
159 
160                         return string;
161                     }
162                 }
163 
164             }
165             return string;
166         }
167 
168     }
169 
170 
171 }