1 /*
2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
8 *
9 * http://www.apache.org/licenses/LICENSE-2.0
10 *
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
16 */
17
18 package org.apache.any23.extractor;
19
20 import org.apache.any23.extractor.html.MicroformatExtractor;
21 import org.openrdf.model.BNode;
22 import org.openrdf.model.Resource;
23
24 import java.util.Arrays;
25 import java.util.List;
26
27 /**
28 * This interface models a specific {@link ExtractionResult}
29 * able to collect property roots generated by <i>HTML Microformat</i> extractions.
30 *
31 * @author Michele Mostarda (mostarda@fbk.eu)
32 */
33 public interface TagSoupExtractionResult extends ExtractionResult {
34
35 /**
36 * Adds a root property to the extraction result, specifying also
37 * the <i>path</i> corresponding to the root of data which generated the property
38 * and the extractor responsible for such addition.
39 *
40 * @param path the <i>path</i> from the document root to the local root of the data generating the property.
41 * @param root the property root node.
42 * @param extractor the extractor responsible of such extraction.
43 */
44 void addResourceRoot(String[] path, Resource root, Class<? extends MicroformatExtractor> extractor);
45
46 /**
47 * Returns all the collected property roots.
48 *
49 * @return an <b>unmodifiable</b> list of
50 * {@link TagSoupExtractionResult.ResourceRoot}s.
51 */
52 List<ResourceRoot> getResourceRoots();
53
54 /**
55 * Adds a property path to the list of the extracted data.
56 *
57 * @param extractor the identifier of the extractor responsible for retrieving such property.
58 * @param propertySubject the subject of the property.
59 * @param property the property URI.
60 * @param object the property object if any, <code>null</code> otherwise.
61 * @param path the path of the <i>HTML</i> node from which the property literal has been extracted.
62 */
63 void addPropertyPath(
64 Class<? extends MicroformatExtractor> extractor,
65 Resource propertySubject,
66 Resource property,
67 BNode object,
68 String[] path
69 );
70
71 /**
72 * Returns all the collected property paths.
73 *
74 * @return a valid list of property paths.
75 */
76 List<PropertyPath> getPropertyPaths();
77
78 /**
79 * Defines a property root object.
80 */
81 class ResourceRoot {
82 private String[] path;
83 private Resource root;
84 private Class<? extends MicroformatExtractor> extractor;
85
86 public ResourceRoot(String[] path, Resource root, Class<? extends MicroformatExtractor> extractor) {
87 if(path == null || path.length == 0) {
88 throw new IllegalArgumentException( String.format("Invalid xpath: '%s'.", Arrays.toString(path) ) );
89 }
90 if(root == null) {
91 throw new IllegalArgumentException("Invalid root, cannot be null.");
92 }
93 if(extractor == null) {
94 throw new IllegalArgumentException("Invalid extractor, cannot ne null");
95 }
96 this.path = path;
97 this.root = root;
98 this.extractor = extractor;
99 }
100
101 public String[] getPath() {
102 return path;
103 }
104
105 public Resource getRoot() {
106 return root;
107 }
108
109 public Class<? extends MicroformatExtractor> getExtractor() {
110 return extractor;
111 }
112
113 @Override
114 public String toString() {
115 return String.format(
116 "%s-%s-%s %s",
117 this.getClass().getCanonicalName(),
118 Arrays.toString(path),
119 root,
120 extractor
121 );
122 }
123 }
124
125 /**
126 * Defines a property path object.
127 */
128 class PropertyPath {
129
130 private Class<? extends MicroformatExtractor> extractor;
131 private String[] path;
132 private Resource subject;
133 private Resource property;
134 private BNode object;
135
136 public PropertyPath(String[] path, Resource subject, Resource property, BNode object, Class<? extends MicroformatExtractor> extractor) {
137 if(path == null) {
138 throw new NullPointerException("path cannot be null.");
139 }
140 if(subject == null) {
141 throw new NullPointerException("subject cannot be null.");
142 }
143 if(property == null) {
144 throw new NullPointerException("property cannot be null.");
145 }
146 if(extractor == null) {
147 throw new NullPointerException("extractor cannot be null.");
148 }
149 this.path = path;
150 this.subject = subject;
151 this.property = property;
152 this.object = object;
153 this.extractor = extractor;
154 }
155
156 public String[] getPath() {
157 return path;
158 }
159
160 public Resource getSubject() {
161 return subject;
162 }
163
164 public Resource getProperty() {
165 return property;
166 }
167
168 public BNode getObject() {
169 return object;
170 }
171
172 public Class<? extends MicroformatExtractor> getExtractor() {
173 return extractor;
174 }
175
176 @Override
177 public String toString() {
178 return String.format(
179 "%s %s - %s - %s -- %s -->",
180 this.getClass().getCanonicalName(),
181 Arrays.toString(path),
182 extractor,
183 subject,
184 property
185 );
186 }
187 }
188
189 }