View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionContext;
21  import org.apache.any23.extractor.ExtractionException;
22  import org.apache.any23.extractor.ExtractionParameters;
23  import org.apache.any23.extractor.ExtractionResult;
24  import org.apache.any23.extractor.Extractor;
25  import org.apache.any23.extractor.ExtractorDescription;
26  import org.apache.any23.extractor.ExtractorFactory;
27  import org.apache.any23.extractor.SimpleExtractorFactory;
28  import org.apache.any23.rdf.PopularPrefixes;
29  import org.apache.any23.rdf.RDFUtils;
30  import org.apache.any23.vocab.SINDICE;
31  import org.openrdf.model.URI;
32  import org.openrdf.model.impl.LiteralImpl;
33  import org.openrdf.model.impl.URIImpl;
34  import org.w3c.dom.Document;
35  import org.w3c.dom.NamedNodeMap;
36  import org.w3c.dom.Node;
37  
38  import java.io.IOException;
39  import java.util.Arrays;
40  import java.util.HashMap;
41  import java.util.HashSet;
42  import java.util.List;
43  import java.util.Map;
44  import java.util.Set;
45  
46  /**
47   * This extractor represents the <i>HTML META</i> tag values
48   * according the <a href="http://www.w3.org/TR/html401/struct/global.html#h-7.4.4">HTML4 specification</a>.
49   *
50   * @author Davide Palmisano ( dpalmisano@gmail.com )
51   */
52  public class HTMLMetaExtractor implements Extractor.TagSoupDOMExtractor {
53  
54      public static final String NAME = "html-head-meta";
55  
56      private static final SINDICE vSINDICE = SINDICE.getInstance();
57  
58      private URI profile;
59  
60      private Map<String, URI> prefixes = new HashMap<String, URI>();
61  
62      private String documentLang;
63  
64      public final static ExtractorFactory<HTMLMetaExtractor> factory =
65              SimpleExtractorFactory.create(
66                      NAME,
67                      PopularPrefixes.createSubset("sindice"),
68                      Arrays.asList("text/html;q=0.02", "application/xhtml+xml;q=0.02"),
69                      "example-meta.html",
70                      HTMLMetaExtractor.class
71              );
72  
73      /**
74       * {@inheritDoc}
75       */
76      public void run(
77              ExtractionParameters extractionParameters,
78              ExtractionContext extractionContext,
79              Document in,
80              ExtractionResult out
81      ) throws IOException, ExtractionException {
82          profile = extractProfile(in);
83          documentLang = getDocumentLanguage(in);
84          extractLinkDefinedPrefixes(in);
85  
86          String baseProfile = vSINDICE.NS;
87          if(profile != null) {
88              baseProfile = profile.toString();
89          }
90  
91          final URI documentURI = extractionContext.getDocumentURI();
92          Set<Meta> metas = extractMetaElement(in, baseProfile);
93          for(Meta meta : metas) {
94              String lang = documentLang;
95              if(meta.getLang() != null) {
96                  lang = meta.getLang();
97              }
98              out.writeTriple(
99                      documentURI,
100                     meta.getName(),
101                     new LiteralImpl(meta.getContent(), lang)
102             );
103         }
104     }
105 
106     /**
107      * Returns the {@link Document} language if declared, <code>null</code> otherwise.
108      *
109      * @param in a instance of {@link Document}.
110      * @return the language declared, could be <code>null</code>.
111      */
112     private String getDocumentLanguage(Document in) {
113         String lang = DomUtils.find(in, "string(/HTML/@lang)");
114         if (lang.equals("")) {
115             return null;
116         }
117         return lang;
118     }
119 
120     private URI extractProfile(Document in) {
121         String profile = DomUtils.find(in, "string(/HTML/@profile)");
122         if (profile.equals("")) {
123             return null;
124         }
125         return new URIImpl(profile);
126     }
127 
128     /**
129      * It extracts prefixes defined in the <i>LINK</i> meta tags.
130      *
131      * @param in
132      */
133     private void extractLinkDefinedPrefixes(Document in) {
134         List<Node> linkNodes = DomUtils.findAll(in, "/HTML/HEAD/LINK");
135         for(Node linkNode : linkNodes) {
136             NamedNodeMap attributes = linkNode.getAttributes();
137             String rel = attributes.getNamedItem("rel").getTextContent();
138             String href = attributes.getNamedItem("href").getTextContent();
139             if(rel != null && href !=null && RDFUtils.isAbsoluteURI(href)) {
140                 prefixes.put(rel, new URIImpl(href));
141             }
142         }
143     }
144 
145     private Set<Meta> extractMetaElement(Document in, String baseProfile) {
146         List<Node> metaNodes = DomUtils.findAll(in, "/HTML/HEAD/META");
147         Set<Meta> result = new HashSet<Meta>();
148         for (Node metaNode : metaNodes) {
149             NamedNodeMap attributes = metaNode.getAttributes();
150             Node nameAttribute = attributes.getNamedItem("name");
151             Node contentAttribute = attributes.getNamedItem("content");
152             if (nameAttribute == null || contentAttribute == null) {
153                 continue;
154             }
155             String name = nameAttribute.getTextContent();
156             String content = contentAttribute.getTextContent();
157             String xpath = DomUtils.getXPathForNode(metaNode);
158             URI nameAsURI = getPrefixIfExists(name);
159             if (nameAsURI == null) {
160                 nameAsURI = new URIImpl(baseProfile + name);
161             }
162             Meta meta = new Meta(xpath, nameAsURI, content);
163             result.add(meta);
164         }
165         return result;
166     }
167 
168     private URI getPrefixIfExists(String name) {
169         String[] split = name.split("\\.");
170         if(split.length == 2 && prefixes.containsKey(split[0])) {
171             return new URIImpl(prefixes.get(split[0]) + split[1]);
172         }
173         return null;
174     }
175 
176     public ExtractorDescription getDescription() {
177         return factory;
178     }
179 
180     private class Meta {
181 
182         private String xpath;
183 
184         private URI name;
185 
186         private String lang;
187 
188         private String content;
189 
190         public Meta(String xpath, URI name, String content) {
191             this.xpath = xpath;
192             this.name = name;
193             this.content = content;
194         }
195 
196         public Meta(String xpath, URI name, String content, String lang) {
197             this(xpath, name, content);
198             this.lang = lang;
199         }
200 
201         public URI getName() {
202             return name;
203         }
204 
205         public void setName(URI name) {
206             this.name = name;
207         }
208 
209         public String getLang() {
210             return lang;
211         }
212 
213         public void setLang(String lang) {
214             this.lang = lang;
215         }
216 
217         public String getContent() {
218             return content;
219         }
220 
221         public void setContent(String content) {
222             this.content = content;
223         }
224 
225         @Override
226         public boolean equals(Object o) {
227             if (this == o) return true;
228             if (o == null || getClass() != o.getClass()) return false;
229 
230             Meta meta = (Meta) o;
231 
232             if (xpath != null ? !xpath.equals(meta.xpath) : meta.xpath != null) return false;
233 
234             return true;
235         }
236 
237         @Override
238         public int hashCode() {
239             return xpath != null ? xpath.hashCode() : 0;
240         }
241     }
242 
243 }