View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.xpath;
19  
20  import org.apache.any23.extractor.ExtractionResult;
21  import org.apache.any23.extractor.html.DomUtils;
22  import org.eclipse.rdf4j.model.IRI;
23  import org.w3c.dom.Document;
24  
25  import java.util.ArrayList;
26  import java.util.HashMap;
27  import java.util.List;
28  import java.util.Map;
29  import java.util.regex.Pattern;
30  
31  /**
32   * Default implementation of {@link XPathExtractionRule}.
33   *
34   * @author Michele Mostarda (mostarda@fbk.eu)
35   */
36  public class TemplateXPathExtractionRuleImpl implements TemplateXPathExtractionRule {
37  
38      private final String name;
39  
40      private final String uriRegex;
41  
42      private final Pattern uriRegexPattern;
43  
44      private final List<Variable> variables;
45  
46      private final List<QuadTemplate> templates;
47  
48      public TemplateXPathExtractionRuleImpl(String name, String uriRegex) {
49          if(name == null) {
50              throw new NullPointerException("The rule name cannot be null.");
51          }
52  
53          this.name = name;
54          this.uriRegex = uriRegex;
55  
56          try {
57              uriRegexPattern = uriRegex != null ? Pattern.compile(uriRegex) : null;
58          } catch (Exception e) {
59              throw new IllegalArgumentException("Invalid value for uriRegex.", e);
60          }
61          variables = new ArrayList<Variable>();
62          templates = new ArrayList<QuadTemplate>();
63      }
64  
65      /**
66       * @return the regex pattern filtering the template pages.
67       */
68      public String getUriRegex() {
69          return uriRegex;
70      }
71  
72      public void add(Variable variable) {
73          checkVariableNameNotDeclared(variable.getName());
74          variables.add(variable);
75      }
76  
77      public boolean remove(Variable variable) {
78          return variables.remove(variable);
79      }
80  
81      public void add(QuadTemplate template) {
82          checkTemplateVariablesDeclared(template);
83          templates.add(template);
84      }
85  
86      public boolean remove(QuadTemplate template) {
87          return templates.remove(template);
88      }
89  
90      public String getName() {
91          return name;
92      }
93  
94      public boolean acceptIRI(IRI uri) {
95          if(uriRegexPattern == null) {
96              return true;
97          }
98          return uriRegexPattern.matcher(uri.stringValue()).find();
99      }
100 
101     public void process(Document in, ExtractionResult er) {
102         final Map<String,String> varValues = new HashMap<String, String>();
103         String value;
104         for(Variable variable : variables) {
105             value = DomUtils.find(in, variable.getxPath().toUpperCase());
106             varValues.put(variable.getName(), value);
107         }
108 
109         for(QuadTemplate template : templates) {
110             template.printOut(er, varValues);
111         }
112     }
113 
114     private boolean variableNameDeclared(String varName) {
115         for(Variable variable : variables) {
116             if(variable.getName().equals(varName)) {
117                 return true;
118             }
119         }
120         return false;
121     }
122 
123     private void checkVariableNameDeclared(String varName) {
124         if (!variableNameDeclared(varName)) {
125             throw new IllegalArgumentException(
126                     String.format("A variable with name '%s' was not declared.", varName)
127             );
128         }
129     }
130 
131     private void checkVariableNameNotDeclared(String varName) {
132         if (variableNameDeclared(varName)) {
133             throw new IllegalArgumentException(
134                     String.format("A variable with name '%s' is already declared.", varName)
135             );
136         }
137     }
138 
139     private void checkTemplateVariablesDeclared(QuadTemplate template) {
140         if( template.getSubject().isVar()   ) checkVariableNameDeclared( template.getSubject().getInternalValue() );
141         if( template.getPredicate().isVar() ) checkVariableNameDeclared( template.getPredicate().getInternalValue() );
142         if( template.getObject().isVar()    ) checkVariableNameDeclared( template.getObject().getInternalValue() );
143         if( template.getGraph() != null && template.getGraph().isVar() ) {
144             checkVariableNameDeclared( template.getGraph().getInternalValue() );
145         }
146     }
147 
148     @Override
149     public String toString() {
150         final StringBuilder sb = new StringBuilder();
151         sb.append('\n');
152         sb.append("name: ").append(name).append('\n');
153         sb.append("pattern: '").append(uriRegex).append("'").append('\n');
154 
155         sb.append("variables {\n");
156         for (Variable variable : variables) {
157             sb.append(variable.getName()).append(":").append(variable.getxPath()).append('\n');
158         }
159         sb.append("}\n");
160 
161         sb.append("templates {\n");
162         for (QuadTemplate template : templates) {
163             sb.append(template).append('\n');
164         }
165         sb.append("}\n");
166         return sb.toString();
167     }
168 }