View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionContext;
21  import org.apache.any23.extractor.ExtractionException;
22  import org.apache.any23.extractor.ExtractionParameters;
23  import org.apache.any23.extractor.ExtractionResult;
24  import org.apache.any23.extractor.Extractor;
25  import org.apache.any23.extractor.ExtractorDescription;
26  import org.apache.any23.rdf.RDFUtils;
27  import org.apache.any23.vocab.SINDICE;
28  import org.eclipse.rdf4j.model.IRI;
29  import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
30  import org.w3c.dom.Document;
31  import org.w3c.dom.NamedNodeMap;
32  import org.w3c.dom.Node;
33  
34  import java.io.IOException;
35  import java.util.HashMap;
36  import java.util.HashSet;
37  import java.util.List;
38  import java.util.Map;
39  import java.util.Set;
40  
41  /**
42   * This extractor represents the <i>HTML META</i> tag values
43   * according the <a href="http://www.w3.org/TR/html401/struct/global.html#h-7.4.4">HTML4 specification</a>.
44   *
45   * @author Davide Palmisano ( dpalmisano@gmail.com )
46   */
47  public class HTMLMetaExtractor implements Extractor.TagSoupDOMExtractor {
48  
49      private static final SINDICE vSINDICE = SINDICE.getInstance();
50  
51      private IRI profile;
52  
53      private Map<String, IRI> prefixes = new HashMap<>();
54  
55      private String documentLang;
56  
57      /**
58       * {@inheritDoc}
59       */
60      @Override
61      public void run(
62              ExtractionParameters extractionParameters,
63              ExtractionContext extractionContext,
64              Document in,
65              ExtractionResult out
66      ) throws IOException, ExtractionException {
67          profile = extractProfile(in);
68          documentLang = getDocumentLanguage(in);
69          extractLinkDefinedPrefixes(in);
70  
71          String baseProfile = vSINDICE.NS;
72          if(profile != null) {
73              baseProfile = profile.toString();
74          }
75  
76          final IRI documentIRI = extractionContext.getDocumentIRI();
77          Set<Meta> metas = extractMetaElement(in, baseProfile);
78          for(Meta meta : metas) {
79              String lang = documentLang;
80              if(meta.getLang() != null) {
81                  lang = meta.getLang();
82              }
83              if(meta.isPragmaDirective){
84                  if(lang != null) {
85                      out.writeTriple(
86                          documentIRI,
87                          meta.getHttpEquiv(),
88                          SimpleValueFactory.getInstance().createLiteral(meta.getContent(), lang));
89                  } else {
90                          out.writeTriple(
91                                  documentIRI,
92                                  meta.getHttpEquiv(),
93                                  SimpleValueFactory.getInstance().createLiteral(meta.getContent()));
94                  }
95              } else {
96                  if(lang != null) {
97                      out.writeTriple(
98                          documentIRI,
99                          meta.getName(),
100                         SimpleValueFactory.getInstance().createLiteral(meta.getContent(), lang));
101                 } else {
102                     out.writeTriple(
103                             documentIRI,
104                             meta.getName(),
105                             SimpleValueFactory.getInstance().createLiteral(meta.getContent()));
106             	}
107             }
108         }
109     }
110 
111     /**
112      * Returns the {@link Document} language if declared, <code>null</code> otherwise.
113      *
114      * @param in a instance of {@link Document}.
115      * @return the language declared, could be <code>null</code>.
116      */
117     private String getDocumentLanguage(Document in) {
118         String lang = DomUtils.find(in, "string(/HTML/@lang)");
119         if ("".equals(lang)) {
120             return null;
121         }
122         return lang;
123     }
124 
125     private IRI extractProfile(Document in) {
126         String profile = DomUtils.find(in, "string(/HTML/@profile)");
127         if ("".equals(profile)) {
128             return null;
129         }
130         return SimpleValueFactory.getInstance().createIRI(profile);
131     }
132 
133     /**
134      * It extracts prefixes defined in the <i>LINK</i> meta tags.
135      *
136      * @param in
137      */
138     private void extractLinkDefinedPrefixes(Document in) {
139         List<Node> linkNodes = DomUtils.findAll(in, "/HTML/HEAD/LINK");
140         for(Node linkNode : linkNodes) {
141             NamedNodeMap attributes = linkNode.getAttributes();
142             Node relNode = attributes.getNamedItem("rel");
143             String rel = relNode == null ? null : relNode.getTextContent();
144             Node hrefNode = attributes.getNamedItem("href");
145             String href = hrefNode == null ? null : hrefNode.getTextContent();
146             if(rel != null && href !=null && RDFUtils.isAbsoluteIRI(href)) {
147                 prefixes.put(rel, SimpleValueFactory.getInstance().createIRI(href));
148             }
149         }
150     }
151 
152     private Set<Meta> extractMetaElement(Document in, String baseProfile) {
153         List<Node> metaNodes = DomUtils.findAll(in, "/HTML/HEAD/META");
154         Set<Meta> result = new HashSet<>();
155         for (Node metaNode : metaNodes) {
156             NamedNodeMap attributes = metaNode.getAttributes();
157             Node nameAttribute = attributes.getNamedItem("name");
158             Node httpEquivAttribute = attributes.getNamedItem("http-equiv");
159             Node contentAttribute = attributes.getNamedItem("content");
160             if (nameAttribute == null && httpEquivAttribute == null)
161                 continue; //support HTML5 meta element nodes that do not have both name and http-equiv
162             if (nameAttribute != null || httpEquivAttribute != null){
163                 if ( contentAttribute == null ){
164                     continue;
165                 }
166             }
167             boolean isPragmaDirective = (httpEquivAttribute != null) ? true : false;
168             if (isPragmaDirective){
169                 String httpEquiv = httpEquivAttribute.getTextContent();
170                 String content = contentAttribute.getTextContent();
171                 String xpath = DomUtils.getXPathForNode(metaNode);
172                 IRI httpEquivAsIRI = getPrefixIfExists(httpEquiv);
173                 if (httpEquivAsIRI == null) {
174                     httpEquivAsIRI = SimpleValueFactory.getInstance().createIRI(baseProfile + httpEquiv);
175                 }
176                 Meta meta = new Meta(xpath, content, httpEquivAsIRI);
177                 result.add(meta);
178             } else {
179                 String name = nameAttribute.getTextContent();
180                 String content = contentAttribute.getTextContent();
181                 String xpath = DomUtils.getXPathForNode(metaNode);
182                 IRI nameAsIRI = getPrefixIfExists(name);
183                 if (nameAsIRI == null) {
184                     nameAsIRI = SimpleValueFactory.getInstance().createIRI(baseProfile + name);
185                 }
186                 Meta meta = new Meta(xpath, nameAsIRI, content);
187                 result.add(meta);
188             }
189         }
190         return result;
191     }
192 
193     private IRI getPrefixIfExists(String name) {
194         String[] split = name.split("\\.");
195         if(split.length == 2 && prefixes.containsKey(split[0])) {
196             return SimpleValueFactory.getInstance().createIRI(prefixes.get(split[0]) + split[1]);
197         }
198         return null;
199     }
200 
201     @Override
202     public ExtractorDescription getDescription() {
203         return HTMLMetaExtractorFactory.getDescriptionInstance();
204     }
205 
206     private class Meta {
207 
208         private String xpath;
209 
210         private IRI name;
211 
212         private IRI httpEquiv;
213 
214         private String lang;
215 
216         private String content;
217 
218         private boolean isPragmaDirective;
219 
220         public Meta(String xpath, String content, IRI httpEquiv) {
221             this.xpath = xpath;
222             this.content = content;
223             this.httpEquiv = httpEquiv;
224             this.setPragmaDirective(true);
225         }
226 
227         @SuppressWarnings("unused")
228         public Meta(String xpath, String content, IRI httpEquiv, String lang) {
229             this(xpath,content,httpEquiv);
230             this.lang = lang;
231         }
232 
233         public Meta(String xpath, IRI name, String content) {
234             this.xpath = xpath;
235             this.name = name;
236             this.content = content;
237         }
238 
239         @SuppressWarnings("unused")
240         public Meta(String xpath, IRI name, String content, String lang) {
241             this(xpath, name, content);
242             this.lang = lang;
243         }
244 
245         private void setPragmaDirective(boolean value){
246             this.isPragmaDirective=value;
247         }
248 
249         public IRI getHttpEquiv(){
250             return httpEquiv;
251         }
252 
253         public IRI getName() {
254             return name;
255         }
256 
257         public String getLang() {
258             return lang;
259         }
260 
261         public String getContent() {
262             return content;
263         }
264 
265         @Override
266         public boolean equals(Object o) {
267             if (this == o)
268                 return true;
269             if (o == null || getClass() != o.getClass())
270                 return false;
271 
272             Meta meta = (Meta) o;
273 
274             if (xpath != null ? !xpath.equals(meta.xpath) : meta.xpath != null)
275                 return false;
276 
277             return true;
278         }
279 
280         @Override
281         public int hashCode() {
282             return xpath != null ? xpath.hashCode() : 0;
283         }
284     }
285 
286 }