View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor;
19  
20  import org.apache.any23.extractor.html.MicroformatExtractor;
21  import org.eclipse.rdf4j.model.BNode;
22  import org.eclipse.rdf4j.model.Resource;
23  
24  import java.util.Arrays;
25  import java.util.List;
26  import java.util.Locale;
27  
28  /**
29   * This interface models a specific {@link ExtractionResult} able to collect property roots generated by <i>HTML
30   * Microformat</i> extractions.
31   *
32   * @author Michele Mostarda (mostarda@fbk.eu)
33   */
34  public interface TagSoupExtractionResult extends ExtractionResult {
35  
36      /**
37       * Adds a root property to the extraction result, specifying also the <i>path</i> corresponding to the root of data
38       * which generated the property and the extractor responsible for such addition.
39       *
40       * @param path
41       *            the <i>path</i> from the document root to the local root of the data generating the property.
42       * @param root
43       *            the property root node.
44       * @param extractor
45       *            the extractor responsible of such extraction.
46       */
47      void addResourceRoot(String[] path, Resource root, Class<? extends MicroformatExtractor> extractor);
48  
49      /**
50       * Returns all the collected property roots.
51       *
52       * @return an <b>unmodifiable</b> list of {@link TagSoupExtractionResult.ResourceRoot}s.
53       */
54      List<ResourceRoot> getResourceRoots();
55  
56      /**
57       * Adds a property path to the list of the extracted data.
58       *
59       * @param extractor
60       *            the identifier of the extractor responsible for retrieving such property.
61       * @param propertySubject
62       *            the subject of the property.
63       * @param property
64       *            the property IRI.
65       * @param object
66       *            the property object if any, <code>null</code> otherwise.
67       * @param path
68       *            the path of the <i>HTML</i> node from which the property literal has been extracted.
69       */
70      void addPropertyPath(Class<? extends MicroformatExtractor> extractor, Resource propertySubject, Resource property,
71              BNode object, String[] path);
72  
73      /**
74       * Returns all the collected property paths.
75       *
76       * @return a valid list of property paths.
77       */
78      List<PropertyPath> getPropertyPaths();
79  
80      /**
81       * Defines a property root object.
82       */
83      class ResourceRoot {
84          private String[] path;
85          private Resource root;
86          private Class<? extends MicroformatExtractor> extractor;
87  
88          public ResourceRoot(String[] path, Resource root, Class<? extends MicroformatExtractor> extractor) {
89              if (path == null || path.length == 0) {
90                  throw new IllegalArgumentException(
91                          String.format(Locale.ROOT, "Invalid xpath: '%s'.", Arrays.toString(path)));
92              }
93              if (root == null) {
94                  throw new IllegalArgumentException("Invalid root, cannot be null.");
95              }
96              if (extractor == null) {
97                  throw new IllegalArgumentException("Invalid extractor, cannot ne null");
98              }
99              this.path = path;
100             this.root = root;
101             this.extractor = extractor;
102         }
103 
104         public String[] getPath() {
105             return path;
106         }
107 
108         public Resource getRoot() {
109             return root;
110         }
111 
112         public Class<? extends MicroformatExtractor> getExtractor() {
113             return extractor;
114         }
115 
116         @Override
117         public String toString() {
118             return String.format(Locale.ROOT, "%s-%s-%s %s", this.getClass().getCanonicalName(), Arrays.toString(path),
119                     root, extractor);
120         }
121     }
122 
123     /**
124      * Defines a property path object.
125      */
126     class PropertyPath {
127 
128         private Class<? extends MicroformatExtractor> extractor;
129         private String[] path;
130         private Resource subject;
131         private Resource property;
132         private BNode object;
133 
134         public PropertyPath(String[] path, Resource subject, Resource property, BNode object,
135                 Class<? extends MicroformatExtractor> extractor) {
136             if (path == null) {
137                 throw new NullPointerException("path cannot be null.");
138             }
139             if (subject == null) {
140                 throw new NullPointerException("subject cannot be null.");
141             }
142             if (property == null) {
143                 throw new NullPointerException("property cannot be null.");
144             }
145             if (extractor == null) {
146                 throw new NullPointerException("extractor cannot be null.");
147             }
148             this.path = path;
149             this.subject = subject;
150             this.property = property;
151             this.object = object;
152             this.extractor = extractor;
153         }
154 
155         public String[] getPath() {
156             return path;
157         }
158 
159         public Resource getSubject() {
160             return subject;
161         }
162 
163         public Resource getProperty() {
164             return property;
165         }
166 
167         public BNode getObject() {
168             return object;
169         }
170 
171         public Class<? extends MicroformatExtractor> getExtractor() {
172             return extractor;
173         }
174 
175         @Override
176         public String toString() {
177             return String.format(Locale.ROOT, "%s %s - %s - %s -- %s -->", this.getClass().getCanonicalName(),
178                     Arrays.toString(path), extractor, subject, property);
179         }
180     }
181 
182 }