View Javadoc
1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.xpath;
19  
20  import org.apache.any23.extractor.ExtractionResult;
21  import org.apache.any23.extractor.html.DomUtils;
22  import org.eclipse.rdf4j.model.IRI;
23  import org.w3c.dom.Document;
24  
25  import java.util.ArrayList;
26  import java.util.HashMap;
27  import java.util.List;
28  import java.util.Locale;
29  import java.util.Map;
30  import java.util.regex.Pattern;
31  
32  /**
33   * Default implementation of {@link XPathExtractionRule}.
34   *
35   * @author Michele Mostarda (mostarda@fbk.eu)
36   */
37  public class TemplateXPathExtractionRuleImpl implements TemplateXPathExtractionRule {
38  
39      private final String name;
40  
41      private final String uriRegex;
42  
43      private final Pattern uriRegexPattern;
44  
45      private final List<Variable> variables;
46  
47      private final List<QuadTemplate> templates;
48  
49      public TemplateXPathExtractionRuleImpl(String name, String uriRegex) {
50          if (name == null) {
51              throw new NullPointerException("The rule name cannot be null.");
52          }
53  
54          this.name = name;
55          this.uriRegex = uriRegex;
56  
57          try {
58              uriRegexPattern = uriRegex != null ? Pattern.compile(uriRegex) : null;
59          } catch (Exception e) {
60              throw new IllegalArgumentException("Invalid value for uriRegex.", e);
61          }
62          variables = new ArrayList<Variable>();
63          templates = new ArrayList<QuadTemplate>();
64      }
65  
66      /**
67       * @return the regex pattern filtering the template pages.
68       */
69      public String getUriRegex() {
70          return uriRegex;
71      }
72  
73      public void add(Variable variable) {
74          checkVariableNameNotDeclared(variable.getName());
75          variables.add(variable);
76      }
77  
78      public boolean remove(Variable variable) {
79          return variables.remove(variable);
80      }
81  
82      public void add(QuadTemplate template) {
83          checkTemplateVariablesDeclared(template);
84          templates.add(template);
85      }
86  
87      public boolean remove(QuadTemplate template) {
88          return templates.remove(template);
89      }
90  
91      public String getName() {
92          return name;
93      }
94  
95      public boolean acceptIRI(IRI uri) {
96          if (uriRegexPattern == null) {
97              return true;
98          }
99          return uriRegexPattern.matcher(uri.stringValue()).find();
100     }
101 
102     public void process(Document in, ExtractionResult er) {
103         final Map<String, String> varValues = new HashMap<String, String>();
104         String value;
105         for (Variable variable : variables) {
106             value = DomUtils.find(in, variable.getxPath().toUpperCase(Locale.ROOT));
107             varValues.put(variable.getName(), value);
108         }
109 
110         for (QuadTemplate template : templates) {
111             template.printOut(er, varValues);
112         }
113     }
114 
115     private boolean variableNameDeclared(String varName) {
116         for (Variable variable : variables) {
117             if (variable.getName().equals(varName)) {
118                 return true;
119             }
120         }
121         return false;
122     }
123 
124     private void checkVariableNameDeclared(String varName) {
125         if (!variableNameDeclared(varName)) {
126             throw new IllegalArgumentException(
127                     String.format(Locale.ROOT, "A variable with name '%s' was not declared.", varName));
128         }
129     }
130 
131     private void checkVariableNameNotDeclared(String varName) {
132         if (variableNameDeclared(varName)) {
133             throw new IllegalArgumentException(
134                     String.format(Locale.ROOT, "A variable with name '%s' is already declared.", varName));
135         }
136     }
137 
138     private void checkTemplateVariablesDeclared(QuadTemplate template) {
139         if (template.getSubject().isVar())
140             checkVariableNameDeclared(template.getSubject().getInternalValue());
141         if (template.getPredicate().isVar())
142             checkVariableNameDeclared(template.getPredicate().getInternalValue());
143         if (template.getObject().isVar())
144             checkVariableNameDeclared(template.getObject().getInternalValue());
145         if (template.getGraph() != null && template.getGraph().isVar()) {
146             checkVariableNameDeclared(template.getGraph().getInternalValue());
147         }
148     }
149 
150     @Override
151     public String toString() {
152         final StringBuilder sb = new StringBuilder();
153         sb.append('\n');
154         sb.append("name: ").append(name).append('\n');
155         sb.append("pattern: '").append(uriRegex).append("'").append('\n');
156 
157         sb.append("variables {\n");
158         for (Variable variable : variables) {
159             sb.append(variable.getName()).append(":").append(variable.getxPath()).append('\n');
160         }
161         sb.append("}\n");
162 
163         sb.append("templates {\n");
164         for (QuadTemplate template : templates) {
165             sb.append(template).append('\n');
166         }
167         sb.append("}\n");
168         return sb.toString();
169     }
170 }