View Javadoc

1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *  http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  package org.apache.any23.extractor.html;
19  
20  import org.w3c.dom.Node;
21  
22  import java.util.ArrayList;
23  import java.util.Arrays;
24  import java.util.Collection;
25  import java.util.Collections;
26  import java.util.HashMap;
27  import java.util.List;
28  import java.util.Map;
29  
30  import static org.apache.any23.extractor.html.HTMLDocument.TextField;
31  
32  /**
33   * An HCard name, consisting of various parts. Handles computation
34   * of full names from first and last names, and similar computations.
35   *
36   * @author Richard Cyganiak (richard@cyganiak.de)
37   */
38  public class HCardName {
39  
40      public static final String GIVEN_NAME = "given-name";
41      public static final String FAMILY_NAME = "family-name";
42      public static final String ADDITIONAL_NAME = "additional-name";
43      public static final String NICKNAME = "nickname";
44      public static final String HONORIFIC_PREFIX = "honorific-prefix";
45      public static final String HONORIFIC_SUFFIX = "honorific-suffix";
46  
47      public static final String[] FIELDS = {
48              GIVEN_NAME,
49              FAMILY_NAME,
50              ADDITIONAL_NAME,
51              NICKNAME,
52              HONORIFIC_PREFIX,
53              HONORIFIC_SUFFIX
54      };
55  
56      private static final String[] NAME_COMPONENTS = {
57              HONORIFIC_PREFIX,
58              GIVEN_NAME,
59              ADDITIONAL_NAME,
60              FAMILY_NAME,
61              HONORIFIC_SUFFIX
62      };
63  
64      private Map<String, FieldValue> fields = new HashMap<String, FieldValue>();
65      private TextField[] fullName   = null;
66      private TextField organization = null;
67      private TextField unit         = null;
68  
69      private static TextField join(TextField[] sarray, String delimiter) {
70          StringBuilder builder = new StringBuilder();
71          final int sarrayLengthMin2 =  sarray.length - 1;
72          for(int i = 0; i < sarray.length; i++) {
73              builder.append(sarray[i].value());
74              if( i < sarrayLengthMin2) {
75                  builder.append(delimiter);
76              }
77          }
78          return new TextField( builder.toString(), sarray[0].source() ) ;
79      }
80  
81      /**
82       * Resets the content of the HName fields.
83       */
84      public void reset() {
85          fields.clear();
86          fullName = null;
87          organization = null;
88          unit = null;
89      }
90  
91      public void setField(String fieldName, TextField nd) {
92          final String value = fixWhiteSpace( nd.value() );
93          if (value == null) return;
94          FieldValue fieldValue = fields.get(fieldName);
95          if(fieldValue == null) {
96              fieldValue = new FieldValue();
97              fields.put(fieldName, fieldValue);
98          }
99          fieldValue.addValue( new TextField(value, nd.source()) );
100     }
101 
102     public void setFullName(TextField nd) {
103         final String value = fixWhiteSpace( nd.value() );
104         if (value == null) return;
105         String[] split = value.split("\\s+");
106         // Supporting case: ['King,',  'Ryan'] that is converted to ['Ryan', 'King'] .
107         final String split0 = split[0];
108         final int split0Length = split0.length();
109         if(split.length > 1 && split0.charAt(split0Length -1) == ',') {
110             String swap = split[1];
111             split[1] = split0.substring(0, split0Length -1);
112             split[0] = swap;
113         }
114         TextField[] splitFields = new TextField[split.length];
115         for(int i = 0; i < split.length; i++) {
116             splitFields[i] = new TextField(split[i], nd.source());
117         }
118         this.fullName = splitFields;
119     }
120 
121     public void setOrganization(TextField nd) {
122         final String value = fixWhiteSpace( nd.value() );
123         if (value == null) return;
124         this.organization = new TextField(value, nd.source());
125     }
126 
127     public boolean isMultiField(String fieldName) {
128         FieldValue fieldValue = fields.get(fieldName);
129         return fieldValue != null && fieldValue.isMultiField();
130     }
131 
132     public boolean containsField(String fieldName) {
133         return GIVEN_NAME.equals(fieldName) || FAMILY_NAME.equals(fieldName) || fields.containsKey(fieldName);
134     }
135 
136     public TextField getField(String fieldName) {
137         if (GIVEN_NAME.equals(fieldName)) {
138             return getFullNamePart(GIVEN_NAME, 0);
139         }
140         if (FAMILY_NAME.equals(fieldName)) {
141             return getFullNamePart(FAMILY_NAME, Integer.MAX_VALUE);
142         }
143         FieldValue v = fields.get(fieldName);
144         return v == null ? null : v.getValue();
145     }
146 
147     public Collection<TextField> getFields(String fieldName) {
148         FieldValue v = fields.get(fieldName);
149         return v == null ? Collections.<TextField>emptyList() : v.getValues();
150     }
151 
152     private TextField getFullNamePart(String fieldName, int index) {
153         if (fields.containsKey(fieldName)) {
154             return fields.get(fieldName).getValue();
155         }
156         if (fullName == null) return null;
157         // If org and fn are the same, the hCard is for an organization, and we do not split the fn
158         if (organization != null && fullName[0].value().equals(organization.value())) {
159             return null;
160         }
161         if (index != Integer.MAX_VALUE && fullName.length <= index) return null;
162         return fullName[ index == Integer.MAX_VALUE ? fullName.length - 1 : index];
163     }
164 
165     public boolean hasField(String fieldName) {
166         return getField(fieldName) != null;
167     }
168 
169     public boolean hasAnyField() {
170         for (String fieldName : FIELDS) {
171             if (hasField(fieldName)) return true;
172         }
173         return false;
174     }
175 
176     public TextField getFullName() {
177         if (fullName != null) return join(fullName, " ");
178         StringBuffer s = new StringBuffer();
179         boolean empty = true;
180         Node first = null;
181         TextField current;
182         for (String fieldName : NAME_COMPONENTS) {
183             if (!hasField(fieldName)) continue;
184             if (!empty) {
185                 s.append(' ');
186             }
187             current = getField(fieldName);
188             if(first == null) { first = current.source(); }
189             s.append( current.value() );
190             empty = false;
191         }
192         if (empty) return null;
193         return new TextField( s.toString(), first);
194     }
195 
196     public TextField getOrganization() {
197         return organization;
198     }
199 
200     public void setOrganizationUnit(TextField nd) {
201         final String value = fixWhiteSpace( nd.value() );
202         if (value == null) return;
203         this.unit = new TextField(value, nd.source() );
204     }
205 
206     public TextField getOrganizationUnit() {
207         return unit;
208     }
209 
210     private String fixWhiteSpace(String s) {
211         if (s == null) return null;
212         s = s.trim().replaceAll("\\s+", " ");
213         if ("".equals(s)) return null;
214         return s;
215     }
216 
217     /**
218      * Represents a possible field value.
219      */
220     private class FieldValue {
221 
222         private TextField value;
223         private List<TextField> multiValue = new ArrayList<TextField>();
224 
225         FieldValue() {}
226 
227         void addValue(TextField v) {
228             if(value == null && multiValue == null) {
229                 value = v;
230             } else if(multiValue == null) {
231                 multiValue = new ArrayList<TextField>();
232                 multiValue.add(value);
233                 value = null;
234                 multiValue.add(v);
235             } else {
236                 multiValue.add(v);
237             }
238         }
239 
240         boolean isMultiField() {
241             return value == null;
242         }
243 
244         TextField getValue() {
245             return value != null ? value : multiValue.get(0);
246         }
247 
248         Collection<TextField> getValues() {
249             return value != null ? Arrays.asList(value) : multiValue;
250         }
251     }
252     
253 }