View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor;
19  
20  import org.apache.any23.extractor.html.MicroformatExtractor;
21  import org.openrdf.model.BNode;
22  import org.openrdf.model.Resource;
23  
24  import java.util.Arrays;
25  import java.util.List;
26  
27  /**
28   * This interface models a specific {@link ExtractionResult}
29   * able to collect property roots generated by <i>HTML Microformat</i> extractions.
30   *
31   * @author Michele Mostarda (mostarda@fbk.eu)
32   */
33  public interface TagSoupExtractionResult extends ExtractionResult {
34  
35      /**
36       * Adds a root property to the extraction result, specifying also
37       * the <i>path</i> corresponding to the root of data which generated the property
38       * and the extractor responsible for such addition.
39       *
40       * @param path the <i>path</i> from the document root to the local root of the data generating the property.
41       * @param root the property root node.
42       * @param extractor the extractor responsible of such extraction.
43       */
44      void addResourceRoot(String[] path, Resource root, Class<? extends MicroformatExtractor> extractor);
45  
46      /**
47       * Returns all the collected property roots.
48       *
49       * @return an <b>unmodifiable</b> list of
50       *         {@link TagSoupExtractionResult.ResourceRoot}s.
51       */
52      List<ResourceRoot> getResourceRoots();
53  
54      /**
55       * Adds a property path to the list of the extracted data.
56       *
57       * @param extractor the identifier of the extractor responsible for retrieving such property.
58       * @param propertySubject the subject of the property.
59       * @param property the property URI.
60       * @param object the property object if any, <code>null</code> otherwise.
61       * @param path the path of the <i>HTML</i> node from which the property literal has been extracted.
62       */
63      void addPropertyPath(
64              Class<? extends MicroformatExtractor> extractor,
65              Resource propertySubject,
66              Resource property,
67              BNode object,
68              String[] path
69      );
70  
71      /**
72       * Returns all the collected property paths.
73       *
74       * @return a valid list of property paths.
75       */
76      List<PropertyPath> getPropertyPaths();
77  
78      /**
79       * Defines a property root object.
80       */
81      class ResourceRoot {
82          private String[] path;
83          private Resource root;
84          private Class<? extends MicroformatExtractor>  extractor;
85  
86          public ResourceRoot(String[] path, Resource root, Class<? extends MicroformatExtractor> extractor) {
87              if(path == null || path.length == 0) {
88                  throw new IllegalArgumentException( String.format("Invalid xpath: '%s'.", Arrays.toString(path) ) );
89              }
90              if(root == null) {
91                  throw new IllegalArgumentException("Invalid root, cannot be null.");
92              }
93              if(extractor == null) {
94                  throw new IllegalArgumentException("Invalid extractor, cannot ne null");
95              }
96              this.path      = path;
97              this.root      = root;
98              this.extractor = extractor;
99          }
100 
101         public String[] getPath() {
102             return path;
103         }
104 
105         public Resource getRoot() {
106             return root;
107         }
108 
109         public Class<? extends MicroformatExtractor> getExtractor() {
110             return extractor;
111         }
112 
113         @Override
114         public String toString() {
115             return String.format(
116                     "%s-%s-%s %s",
117                     this.getClass().getCanonicalName(),
118                     Arrays.toString(path), 
119                     root,
120                     extractor
121             );
122         }
123     }
124 
125     /**
126      * Defines a property path object.
127      */
128     class PropertyPath {
129 
130         private Class<? extends MicroformatExtractor>  extractor;
131         private String[] path;
132         private Resource subject;
133         private Resource property;
134         private BNode    object;
135 
136         public PropertyPath(String[] path, Resource subject, Resource property, BNode object, Class<? extends MicroformatExtractor> extractor) {
137             if(path == null) {
138                 throw new NullPointerException("path cannot be null.");
139             }
140             if(subject == null) {
141                 throw new NullPointerException("subject cannot be null.");
142             }
143             if(property == null) {
144                 throw new NullPointerException("property cannot be null.");
145             }
146             if(extractor == null) {
147                 throw new NullPointerException("extractor cannot be null.");
148             }
149             this.path      = path;
150             this.subject   = subject;
151             this.property  = property;
152             this.object    = object;
153             this.extractor = extractor;
154         }
155 
156         public String[] getPath() {
157             return path;
158         }
159 
160         public Resource getSubject() {
161             return subject;
162         }
163 
164         public Resource getProperty() {
165             return property;
166         }
167 
168         public BNode getObject() {
169             return object;
170         }
171 
172         public Class<? extends MicroformatExtractor> getExtractor() {
173             return extractor;
174         }
175 
176         @Override
177          public String toString() {
178             return String.format(
179                     "%s %s - %s - %s -- %s -->",
180                     this.getClass().getCanonicalName(),
181                     Arrays.toString(path),
182                     extractor,
183                     subject,
184                     property
185             );
186         }
187     }
188 
189 }