View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionContext;
21  import org.apache.any23.extractor.ExtractionException;
22  import org.apache.any23.extractor.ExtractionParameters;
23  import org.apache.any23.extractor.ExtractionResult;
24  import org.apache.any23.extractor.ExtractorDescription;
25  import org.apache.any23.extractor.ExtractorFactory;
26  import org.apache.any23.extractor.SimpleExtractorFactory;
27  import org.apache.any23.rdf.PopularPrefixes;
28  import org.apache.any23.vocab.XHTML;
29  import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
30  import org.openrdf.model.URI;
31  import org.openrdf.model.ValueFactory;
32  import org.openrdf.model.impl.ValueFactoryImpl;
33  import org.w3c.dom.Document;
34  import org.w3c.dom.Node;
35  
36  import java.io.IOException;
37  import java.util.Arrays;
38  import java.util.List;
39  
40  /**
41   * This {@link org.apache.any23.extractor.Extractor.TagSoupDOMExtractor} implementation
42   * retrieves the <code>LINK</code>s declared within the <code>HTML/HEAD</code> page header.
43   */
44  public class HeadLinkExtractor implements TagSoupDOMExtractor {
45  
46      public void run(
47              ExtractionParameters extractionParameters,
48              ExtractionContext extractionContext,
49              Document in,
50              ExtractionResult out
51      ) throws IOException, ExtractionException {
52          HTMLDocument html = new HTMLDocument(in);
53          ValueFactory vf = ValueFactoryImpl.getInstance();
54  
55          final List<Node> headLinkNodes = DomUtils.findAll(
56                  in,
57                  "/HTML/HEAD/LINK[(" +
58                          "@type='application/rdf+xml' or " +
59                          "@type='text/rdf' or " +
60                          "@type='application/x-turtle' or " +
61                          "@type='application/turtle' or " +
62                          "@type='text/turtle' or " +
63                          "@type='text/rdf+n3'" +
64                          ") and @href and @rel]"
65          );
66          for (Node node : headLinkNodes) {
67              final URI href = html.resolveURI(DomUtils.find(node, "@href"));
68              final String rel = DomUtils.find(node, "@rel");
69              out.writeTriple(
70                      extractionContext.getDocumentURI(),
71                      vf.createURI(XHTML.NS + rel),
72                      href
73              );
74              final String title = DomUtils.find(node, "@title");
75              if (title != null && !"".equals(title)) {
76                  out.writeTriple(
77                          href,
78                          factory.getPrefixes().expand("dcterms:title"),
79                          vf.createLiteral(title)
80                  );
81              }
82              final String type = DomUtils.find(node, "@type");
83              if (type != null && !"".equals(type)) {
84                  out.writeTriple(
85                          href,
86                          factory.getPrefixes().expand("dcterms:format"),
87                          vf.createLiteral(type)
88                  );
89              }
90          }
91      }
92  
93      public ExtractorDescription getDescription() {
94          return factory;
95      }
96  
97      public final static ExtractorFactory<HeadLinkExtractor> factory =
98              SimpleExtractorFactory.create(
99                      "html-head-links",
100                     PopularPrefixes.createSubset("xhtml", "dcterms"),
101                     Arrays.asList("text/html;q=0.05", "application/xhtml+xml;q=0.05"),
102                     "example-head-link.html",
103                     HeadLinkExtractor.class);
104 }