1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18 package org.apache.any23.extractor.html;
19
20 import org.apache.any23.extractor.ExtractionContext;
21 import org.apache.any23.extractor.ExtractionException;
22 import org.apache.any23.extractor.ExtractionParameters;
23 import org.apache.any23.extractor.ExtractionResult;
24 import org.apache.any23.extractor.ExtractorDescription;
25 import org.apache.any23.extractor.ExtractorFactory;
26 import org.apache.any23.extractor.SimpleExtractorFactory;
27 import org.apache.any23.rdf.Any23ValueFactoryWrapper;
28 import org.apache.any23.rdf.PopularPrefixes;
29 import org.apache.any23.vocab.DCTERMS;
30 import org.apache.any23.extractor.Extractor.TagSoupDOMExtractor;
31 import org.openrdf.model.impl.ValueFactoryImpl;
32 import org.w3c.dom.Document;
33
34 import java.io.IOException;
35 import java.util.Arrays;
36
37
38
39
40
41
42
43 public class TitleExtractor implements TagSoupDOMExtractor {
44
45 public static final String NAME = "html-head-title";
46
47 private static final DCTERMS vDCTERMS = DCTERMS.getInstance();
48
49 public final static ExtractorFactory<TitleExtractor> factory =
50 SimpleExtractorFactory.create(
51 NAME,
52 PopularPrefixes.createSubset("dcterms"),
53 Arrays.asList("text/html;q=0.02", "application/xhtml+xml;q=0.02"),
54 "example-title.html",
55 TitleExtractor.class
56 );
57
58 public void run(
59 ExtractionParameters extractionParameters,
60 ExtractionContext extractionContext,
61 Document in,
62 ExtractionResult out
63 ) throws IOException, ExtractionException {
64 final Any23ValueFactoryWrapper valueFactory = new Any23ValueFactoryWrapper(
65 ValueFactoryImpl.getInstance(), out, extractionContext.getDefaultLanguage()
66 );
67
68 try {
69 String title = DomUtils.find(in, "/HTML/HEAD/TITLE/text()").trim();
70 if (title != null && (title.length() != 0)) {
71 out.writeTriple(extractionContext.getDocumentURI(), vDCTERMS.title, valueFactory.createLiteral(title));
72 }
73 } finally {
74 valueFactory.setIssueReport(null);
75 }
76 }
77
78 public ExtractorDescription getDescription() {
79 return factory;
80 }
81
82 }