View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionContext;
21  import org.apache.any23.extractor.ExtractionException;
22  import org.apache.any23.extractor.ExtractionParameters;
23  import org.apache.any23.extractor.ExtractionResult;
24  import org.apache.any23.extractor.Extractor;
25  import org.apache.any23.extractor.ExtractorDescription;
26  import org.apache.any23.rdf.RDFUtils;
27  import org.apache.any23.vocab.SINDICE;
28  import org.eclipse.rdf4j.model.IRI;
29  import org.eclipse.rdf4j.model.impl.SimpleValueFactory;
30  import org.w3c.dom.Document;
31  import org.w3c.dom.NamedNodeMap;
32  import org.w3c.dom.Node;
33  
34  import java.io.IOException;
35  import java.util.HashMap;
36  import java.util.HashSet;
37  import java.util.List;
38  import java.util.Map;
39  import java.util.Set;
40  
41  /**
42   * This extractor represents the <i>HTML META</i> tag values according the
43   * <a href="http://www.w3.org/TR/html401/struct/global.html#h-7.4.4">HTML4 specification</a>.
44   *
45   * @author Davide Palmisano ( dpalmisano@gmail.com )
46   */
47  public class HTMLMetaExtractor implements Extractor.TagSoupDOMExtractor {
48  
49      private static final SINDICE vSINDICE = SINDICE.getInstance();
50  
51      private IRI profile;
52  
53      private Map<String, IRI> prefixes = new HashMap<>();
54  
55      private String documentLang;
56  
57      /**
58       * {@inheritDoc}
59       */
60      @Override
61      public void run(ExtractionParameters extractionParameters, ExtractionContext extractionContext, Document in,
62              ExtractionResult out) throws IOException, ExtractionException {
63          profile = extractProfile(in);
64          documentLang = getDocumentLanguage(in);
65          extractLinkDefinedPrefixes(in);
66  
67          String baseProfile = vSINDICE.NS;
68          if (profile != null) {
69              baseProfile = profile.toString();
70          }
71  
72          final IRI documentIRI = extractionContext.getDocumentIRI();
73          Set<Meta> metas = extractMetaElement(in, baseProfile);
74          for (Meta meta : metas) {
75              String lang = documentLang;
76              if (meta.getLang() != null) {
77                  lang = meta.getLang();
78              }
79              if (meta.isPragmaDirective) {
80                  if (lang != null) {
81                      out.writeTriple(documentIRI, meta.getHttpEquiv(),
82                              SimpleValueFactory.getInstance().createLiteral(meta.getContent(), lang));
83                  } else {
84                      out.writeTriple(documentIRI, meta.getHttpEquiv(),
85                              SimpleValueFactory.getInstance().createLiteral(meta.getContent()));
86                  }
87              } else {
88                  if (lang != null) {
89                      out.writeTriple(documentIRI, meta.getName(),
90                              SimpleValueFactory.getInstance().createLiteral(meta.getContent(), lang));
91                  } else {
92                      out.writeTriple(documentIRI, meta.getName(),
93                              SimpleValueFactory.getInstance().createLiteral(meta.getContent()));
94                  }
95              }
96          }
97      }
98  
99      /**
100      * Returns the {@link Document} language if declared, <code>null</code> otherwise.
101      *
102      * @param in
103      *            a instance of {@link Document}.
104      * 
105      * @return the language declared, could be <code>null</code>.
106      */
107     private String getDocumentLanguage(Document in) {
108         String lang = DomUtils.find(in, "string(/HTML/@lang)");
109         if ("".equals(lang)) {
110             return null;
111         }
112         return lang;
113     }
114 
115     private IRI extractProfile(Document in) {
116         String profile = DomUtils.find(in, "string(/HTML/@profile)");
117         if ("".equals(profile)) {
118             return null;
119         }
120         return SimpleValueFactory.getInstance().createIRI(profile);
121     }
122 
123     /**
124      * It extracts prefixes defined in the <i>LINK</i> meta tags.
125      *
126      * @param in
127      */
128     private void extractLinkDefinedPrefixes(Document in) {
129         List<Node> linkNodes = DomUtils.findAll(in, "/HTML/HEAD/LINK");
130         for (Node linkNode : linkNodes) {
131             NamedNodeMap attributes = linkNode.getAttributes();
132             Node relNode = attributes.getNamedItem("rel");
133             String rel = relNode == null ? null : relNode.getTextContent();
134             Node hrefNode = attributes.getNamedItem("href");
135             String href = hrefNode == null ? null : hrefNode.getTextContent();
136             if (rel != null && href != null && RDFUtils.isAbsoluteIRI(href)) {
137                 prefixes.put(rel, SimpleValueFactory.getInstance().createIRI(href));
138             }
139         }
140     }
141 
142     private Set<Meta> extractMetaElement(Document in, String baseProfile) {
143         List<Node> metaNodes = DomUtils.findAll(in, "/HTML/HEAD/META");
144         Set<Meta> result = new HashSet<>();
145         for (Node metaNode : metaNodes) {
146             NamedNodeMap attributes = metaNode.getAttributes();
147             Node nameAttribute = attributes.getNamedItem("name");
148             Node httpEquivAttribute = attributes.getNamedItem("http-equiv");
149             Node contentAttribute = attributes.getNamedItem("content");
150             if (nameAttribute == null && httpEquivAttribute == null)
151                 continue; // support HTML5 meta element nodes that do not have both name and http-equiv
152             if (nameAttribute != null || httpEquivAttribute != null) {
153                 if (contentAttribute == null) {
154                     continue;
155                 }
156             }
157             boolean isPragmaDirective = (httpEquivAttribute != null) ? true : false;
158             if (isPragmaDirective) {
159                 String httpEquiv = httpEquivAttribute.getTextContent();
160                 String content = contentAttribute.getTextContent();
161                 String xpath = DomUtils.getXPathForNode(metaNode);
162                 IRI httpEquivAsIRI = getPrefixIfExists(httpEquiv);
163                 if (httpEquivAsIRI == null) {
164                     httpEquivAsIRI = SimpleValueFactory.getInstance().createIRI(baseProfile + httpEquiv);
165                 }
166                 Meta meta = new Meta(xpath, content, httpEquivAsIRI);
167                 result.add(meta);
168             } else {
169                 String name = nameAttribute.getTextContent();
170                 String content = contentAttribute.getTextContent();
171                 String xpath = DomUtils.getXPathForNode(metaNode);
172                 IRI nameAsIRI = getPrefixIfExists(name);
173                 if (nameAsIRI == null) {
174                     nameAsIRI = SimpleValueFactory.getInstance().createIRI(baseProfile + name);
175                 }
176                 Meta meta = new Meta(xpath, nameAsIRI, content);
177                 result.add(meta);
178             }
179         }
180         return result;
181     }
182 
183     private IRI getPrefixIfExists(String name) {
184         String[] split = name.split("\\.");
185         if (split.length == 2 && prefixes.containsKey(split[0])) {
186             return SimpleValueFactory.getInstance().createIRI(prefixes.get(split[0]) + split[1]);
187         }
188         return null;
189     }
190 
191     @Override
192     public ExtractorDescription getDescription() {
193         return HTMLMetaExtractorFactory.getDescriptionInstance();
194     }
195 
196     private static class Meta {
197 
198         private String xpath;
199 
200         private IRI name;
201 
202         private IRI httpEquiv;
203 
204         private String lang;
205 
206         private String content;
207 
208         private boolean isPragmaDirective;
209 
210         public Meta(String xpath, String content, IRI httpEquiv) {
211             this.xpath = xpath;
212             this.content = content;
213             this.httpEquiv = httpEquiv;
214             this.setPragmaDirective(true);
215         }
216 
217         @SuppressWarnings("unused")
218         public Meta(String xpath, String content, IRI httpEquiv, String lang) {
219             this(xpath, content, httpEquiv);
220             this.lang = lang;
221         }
222 
223         public Meta(String xpath, IRI name, String content) {
224             this.xpath = xpath;
225             this.name = name;
226             this.content = content;
227         }
228 
229         @SuppressWarnings("unused")
230         public Meta(String xpath, IRI name, String content, String lang) {
231             this(xpath, name, content);
232             this.lang = lang;
233         }
234 
235         private void setPragmaDirective(boolean value) {
236             this.isPragmaDirective = value;
237         }
238 
239         public IRI getHttpEquiv() {
240             return httpEquiv;
241         }
242 
243         public IRI getName() {
244             return name;
245         }
246 
247         public String getLang() {
248             return lang;
249         }
250 
251         public String getContent() {
252             return content;
253         }
254 
255         @Override
256         public boolean equals(Object o) {
257             if (this == o)
258                 return true;
259             if (o == null || getClass() != o.getClass())
260                 return false;
261 
262             Meta meta = (Meta) o;
263 
264             if (xpath != null ? !xpath.equals(meta.xpath) : meta.xpath != null)
265                 return false;
266 
267             return true;
268         }
269 
270         @Override
271         public int hashCode() {
272             return xpath != null ? xpath.hashCode() : 0;
273         }
274     }
275 
276 }