View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.apache.any23.extractor.ExtractionContext;
21  import org.apache.any23.extractor.ExtractionException;
22  import org.apache.any23.extractor.ExtractionParameters;
23  import org.apache.any23.extractor.ExtractionResult;
24  import org.apache.any23.extractor.ExtractorDescription;
25  import org.apache.any23.extractor.ExtractorFactory;
26  import org.apache.any23.extractor.SimpleExtractorFactory;
27  import org.apache.any23.rdf.Any23ValueFactoryWrapper;
28  import org.apache.any23.rdf.PopularPrefixes;
29  import org.apache.any23.vocab.DCTERMS;
30  import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
31  import org.openrdf.model.impl.ValueFactoryImpl;
32  import org.w3c.dom.Document;
33  
34  import java.io.IOException;
35  import java.util.Arrays;
36  
37  /**
38   * Extracts the value of the <title> element of an
39   * HTML or XHTML page.
40   *
41   * @author Richard Cyganiak (richard@cyganiak.de)
42   */
43  public class TitleExtractor implements TagSoupDOMExtractor {
44  
45      public static final String NAME = "html-head-title";
46  
47      private static final DCTERMS vDCTERMS = DCTERMS.getInstance();
48  
49      public final static ExtractorFactory<TitleExtractor> factory =
50              SimpleExtractorFactory.create(
51                      NAME,
52                      PopularPrefixes.createSubset("dcterms"),
53                      Arrays.asList("text/html;q=0.02", "application/xhtml+xml;q=0.02"),
54                      "example-title.html",
55                      TitleExtractor.class
56              );
57  
58      public void run(
59              ExtractionParameters extractionParameters,
60              ExtractionContext extractionContext,
61              Document in,
62              ExtractionResult out
63      ) throws IOException, ExtractionException {
64          final Any23ValueFactoryWrapper valueFactory = new Any23ValueFactoryWrapper(
65              ValueFactoryImpl.getInstance(), out, extractionContext.getDefaultLanguage()
66          );
67          
68          try {
69              String title = DomUtils.find(in, "/HTML/HEAD/TITLE/text()").trim();
70              if (title != null && (title.length() != 0)) {
71                  out.writeTriple(extractionContext.getDocumentURI(), vDCTERMS.title, valueFactory.createLiteral(title));
72              }
73          } finally {
74              valueFactory.setIssueReport(null);
75          }
76      }
77  
78      public ExtractorDescription getDescription() {
79          return factory;
80      }
81      
82  }