View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.rdfa;
19  
20  import org.apache.commons.lang3.ArrayUtils;
21  import org.jsoup.nodes.CDataNode;
22  import org.jsoup.nodes.Comment;
23  import org.jsoup.nodes.Element;
24  import org.jsoup.nodes.Node;
25  import org.jsoup.nodes.TextNode;
26  import org.jsoup.select.NodeVisitor;
27  import org.semarglproject.sink.XmlSink;
28  import org.xml.sax.SAXException;
29  import org.xml.sax.helpers.AttributesImpl;
30  import org.xml.sax.helpers.NamespaceSupport;
31  
32  import java.util.ArrayList;
33  
34  /**
35   * @author Hans Brende (hansbrende@apache.org)
36   */
37  class JsoupScanner implements NodeVisitor {
38  
39      private final NamespaceSupport ns = new NamespaceSupport();
40      private final AttributesImpl attrs = new AttributesImpl();
41      private final String[] nameParts = new String[3];
42  
43      private final XmlSink handler;
44  
45      JsoupScanner(XmlSink handler) {
46          this.handler = handler;
47      }
48  
49      private static String orEmpty(String str) {
50          return str == null ? "" : str;
51      }
52  
53      private static final String[] commonHashDelimitedVocabs = { "http://creativecommons.org/ns",
54              "http://www.w3.org/2002/07/owl", "http://www.w3.org/1999/02/22-rdf-syntax-ns", "http://www.w3.org/ns/rdfa",
55              "http://www.w3.org/2000/01/rdf-schema", "http://www.w3.org/1999/xhtml/vocab",
56              "http://www.w3.org/2001/XMLSchema", "http://microformats.org/profile/hcard",
57              "http://www.w3.org/2006/vcard/ns", "http://ogp.me/ns", "http://ogp.me/ns/music", "http://ogp.me/ns/video",
58              "http://ogp.me/ns/article", "http://ogp.me/ns/book", "http://ogp.me/ns/profile",
59              "http://ogp.me/ns/website" };
60  
61      private void startElement(Element e) throws SAXException {
62          ns.pushContext();
63  
64          attrs.clear();
65          final ArrayList<String> remainingAttrs = new ArrayList<>();
66          for (org.jsoup.nodes.Attribute attr : e.attributes()) {
67              String name = attr.getKey();
68              String value = attr.getValue();
69              if (name.startsWith("xmlns")) {
70                  if (name.length() == 5) {
71                      ns.declarePrefix("", value);
72                      handler.startPrefixMapping("", value);
73                      continue;
74                  } else if (name.charAt(5) == ':') {
75                      String localName = name.substring(6);
76                      ns.declarePrefix(localName, value);
77                      handler.startPrefixMapping(localName, value);
78                      continue;
79                  }
80              } else if (name.equalsIgnoreCase("vocab")) {
81                  // Fix for ANY23-428
82                  name = "vocab";
83                  value = value.trim();
84                  int len = value.length();
85                  char lastChar;
86                  if (len != 0 && (lastChar = value.charAt(len - 1)) != '/' && lastChar != '#' && lastChar != ':') {
87                      if (ArrayUtils.contains(commonHashDelimitedVocabs, value)) {
88                          value += "#";
89                      } else {
90                          value += "/";
91                      }
92                  }
93              }
94  
95              remainingAttrs.add(name);
96              remainingAttrs.add(value);
97          }
98  
99          for (int i = 0, len = remainingAttrs.size(); i < len; i += 2) {
100             String name = remainingAttrs.get(i);
101             String value = remainingAttrs.get(i + 1);
102             String[] parts = ns.processName(name, nameParts, true);
103             if (parts != null) {
104                 attrs.addAttribute(orEmpty(parts[0]), orEmpty(parts[1]), parts[2], "CDATA", value);
105             }
106         }
107 
108         String qName = e.tagName();
109 
110         String[] parts = ns.processName(qName, nameParts, false);
111         if (parts == null) {
112             handler.startElement("", "", qName, attrs);
113         } else {
114             handler.startElement(orEmpty(parts[0]), orEmpty(parts[1]), parts[2], attrs);
115         }
116 
117     }
118 
119     private void endElement(Element e) throws SAXException {
120 
121         String qName = e.tagName();
122         String[] parts = ns.processName(qName, nameParts, false);
123         if (parts == null) {
124             handler.endElement("", "", qName);
125         } else {
126             handler.endElement(orEmpty(parts[0]), orEmpty(parts[1]), parts[2]);
127         }
128 
129         for (org.jsoup.nodes.Attribute attr : e.attributes()) {
130             String name = attr.getKey();
131             if (name.startsWith("xmlns")) {
132                 if (name.length() == 5) {
133                     handler.endPrefixMapping("");
134                 } else if (name.charAt(5) == ':') {
135                     String localName = name.substring(6);
136                     handler.endPrefixMapping(localName);
137                 }
138             }
139         }
140 
141         ns.popContext();
142     }
143 
144     private void handleText(String str) throws SAXException {
145         handler.characters(str.toCharArray(), 0, str.length());
146     }
147 
148     private void handleComment(String str) throws SAXException {
149         handler.comment(str.toCharArray(), 0, str.length());
150     }
151 
152     @Override
153     public void head(Node node, int depth) {
154         try {
155             if (node instanceof Element) {
156                 startElement((Element) node);
157             } else if (node instanceof CDataNode) {
158                 handler.startCDATA();
159                 handleText(((CDataNode) node).text());
160             } else if (node instanceof TextNode) {
161                 handleText(((TextNode) node).text());
162                 // TODO support document types
163                 // } else if (node instanceof DocumentType) {
164                 // DocumentType dt = (DocumentType)node;
165                 // handler.startDTD(dt.attr("name"), orNull(dt.attr("publicId")), orNull(dt.attr("systemId")));
166             } else if (node instanceof Comment) {
167                 handleComment(((Comment) node).getData());
168             }
169         } catch (SAXException e) {
170             sneakyThrow(e);
171         }
172     }
173 
174     @Override
175     public void tail(Node node, int depth) {
176         try {
177             if (node instanceof Element) {
178                 endElement((Element) node);
179             } else if (node instanceof CDataNode) {
180                 handler.endCDATA();
181                 // TODO support document types
182                 // } else if (node instanceof DocumentType) {
183                 // handler.endDTD();
184             }
185         } catch (SAXException e) {
186             sneakyThrow(e);
187         }
188     }
189 
190     @SuppressWarnings("unchecked")
191     private static <E extends Throwable> void sneakyThrow(Throwable e) throws E {
192         throw (E) e;
193     }
194 }