001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * https://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017 018package org.apache.commons.codec.language; 019 020import java.util.Locale; 021 022import org.apache.commons.codec.EncoderException; 023import org.apache.commons.codec.StringEncoder; 024 025/** 026 * Encodes a string into a Metaphone value. 027 * <p> 028 * Initial Java implementation by <CITE>William B. Brogden. December, 1997</CITE>. 029 * Permission given by <CITE>wbrogden</CITE> for code to be used anywhere. 030 * </p> 031 * <p> 032 * <CITE>Hanging on the Metaphone</CITE> by <CITE>Lawrence Philips</CITE> in <CITE>Computer Language of Dec. 1990, 033 * p 39.</CITE> 034 * </p> 035 * <p> 036 * Note, that this does not match the algorithm that ships with PHP, or the algorithm found in the Perl implementations: 037 * </p> 038 * <ul> 039 * <li><a href="https://search.cpan.org/~mschwern/Text-Metaphone-1.96/Metaphone.pm">Text:Metaphone-1.96</a> 040 * (broken link 4/30/2013) </li> 041 * <li><a href="https://metacpan.org/source/MSCHWERN/Text-Metaphone-1.96//Metaphone.pm">Text:Metaphone-1.96</a> 042 * (link checked 4/30/2013) </li> 043 * </ul> 044 * <p> 045 * They have had undocumented changes from the originally published algorithm. 046 * For more information, see <a href="https://issues.apache.org/jira/browse/CODEC-57">CODEC-57</a>. 047 * </p> 048 * <p> 049 * This class is conditionally thread-safe. 050 * The instance field for maximum code length is mutable {@link #setMaxCodeLen(int)} 051 * but is not volatile, and accesses are not synchronized. 052 * If an instance of the class is shared between threads, the caller needs to ensure that suitable synchronization 053 * is used to ensure safe publication of the value between threads, and must not invoke {@link #setMaxCodeLen(int)} 054 * after initial setup. 055 * </p> 056 */ 057public class Metaphone implements StringEncoder { 058 059 /** 060 * Five values in the English language. 061 */ 062 private static final String VOWELS = "AEIOU"; 063 064 /** 065 * Variable used in Metaphone algorithm. 066 */ 067 private static final String FRONTV = "EIY"; 068 069 /** 070 * Variable used in Metaphone algorithm. 071 */ 072 private static final String VARSON = "CSPTG"; 073 074 /** 075 * The max code length for Metaphone is 4. 076 */ 077 private int maxCodeLen = 4; 078 079 /** 080 * Constructs a new instance. 081 */ 082 public Metaphone() { 083 // empty 084 } 085 086 /** 087 * Encodes an Object using the Metaphone algorithm. This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an 088 * EncoderException if the supplied object is not of type {@link String}. 089 * 090 * @param obj Object to encode. 091 * @return An object (or type {@link String}) containing the Metaphone code which corresponds to the String supplied. 092 * @throws EncoderException if the parameter supplied is not of type {@link String}. 093 */ 094 @Override 095 public Object encode(final Object obj) throws EncoderException { 096 if (!(obj instanceof String)) { 097 throw new EncoderException("Parameter supplied to Metaphone encode is not of type java.lang.String"); 098 } 099 return metaphone((String) obj); 100 } 101 102 /** 103 * Encodes a String using the Metaphone algorithm. 104 * 105 * @param str String object to encode 106 * @return The Metaphone code corresponding to the String supplied 107 */ 108 @Override 109 public String encode(final String str) { 110 return metaphone(str); 111 } 112 113 /** 114 * Gets the maxCodeLen. 115 * 116 * @return the maxCodeLen. 117 */ 118 public int getMaxCodeLen() { 119 return this.maxCodeLen; 120 } 121 122 private boolean isLastChar(final int wdsz, final int n) { 123 return n + 1 == wdsz; 124 } 125 126 /** 127 * Tests is the Metaphones of two strings are identical. 128 * 129 * @param str1 First of two strings to compare. 130 * @param str2 Second of two strings to compare. 131 * @return {@code true} if the Metaphones of these strings are identical, {@code false} otherwise. 132 */ 133 public boolean isMetaphoneEqual(final String str1, final String str2) { 134 return metaphone(str1).equals(metaphone(str2)); 135 } 136 137 private boolean isNextChar(final StringBuilder string, final int index, final char c) { 138 boolean matches = false; 139 if (index >= 0 && index < string.length() - 1) { 140 matches = string.charAt(index + 1) == c; 141 } 142 return matches; 143 } 144 145 private boolean isPreviousChar(final StringBuilder string, final int index, final char c) { 146 boolean matches = false; 147 if (index > 0 && index < string.length()) { 148 matches = string.charAt(index - 1) == c; 149 } 150 return matches; 151 } 152 153 private boolean isVowel(final StringBuilder string, final int index) { 154 return VOWELS.indexOf(string.charAt(index)) >= 0; 155 } 156 157 /** 158 * Find the Metaphone value of a String. This is similar to the 159 * Soundex algorithm, but better at finding similar sounding words. 160 * All input is converted to upper case. 161 * Limitations: Input format is expected to be a single ASCII word 162 * with only characters in the A - Z range, no punctuation or numbers. 163 * 164 * @param txt String to find the Metaphone code for. 165 * @return A Metaphone code corresponding to the String supplied. 166 */ 167 public String metaphone(final String txt) { 168 boolean hard = false; 169 final int txtLength; 170 if (txt == null || (txtLength = txt.length()) == 0) { 171 return ""; 172 } 173 // single character is itself 174 if (txtLength == 1) { 175 return txt.toUpperCase(Locale.ENGLISH); 176 } 177 178 final char[] inwd = txt.toUpperCase(Locale.ENGLISH).toCharArray(); 179 180 final StringBuilder local = new StringBuilder(40); // manipulate 181 final StringBuilder code = new StringBuilder(10); // output 182 // handle initial 2 characters exceptions 183 switch (inwd[0]) { 184 case 'K': 185 case 'G': 186 case 'P': /* looking for KN, etc */ 187 if (inwd[1] == 'N') { 188 local.append(inwd, 1, inwd.length - 1); 189 } else { 190 local.append(inwd); 191 } 192 break; 193 case 'A': /* looking for AE */ 194 if (inwd[1] == 'E') { 195 local.append(inwd, 1, inwd.length - 1); 196 } else { 197 local.append(inwd); 198 } 199 break; 200 case 'W': /* looking for WR or WH */ 201 if (inwd[1] == 'R') { // WR -> R 202 local.append(inwd, 1, inwd.length - 1); 203 break; 204 } 205 if (inwd[1] == 'H') { 206 local.append(inwd, 1, inwd.length - 1); 207 local.setCharAt(0, 'W'); // WH -> W 208 } else { 209 local.append(inwd); 210 } 211 break; 212 case 'X': /* initial X becomes S */ 213 inwd[0] = 'S'; 214 local.append(inwd); 215 break; 216 default: 217 local.append(inwd); 218 } // now local has working string with initials fixed 219 220 final int wdsz = local.length(); 221 int n = 0; 222 223 while (code.length() < getMaxCodeLen() && n < wdsz) { // max code size of 4 works well 224 final char symb = local.charAt(n); 225 // remove duplicate letters except C 226 if (symb == 'C' || !isPreviousChar(local, n, symb)) { 227 // not dup 228 switch (symb) { 229 case 'A': 230 case 'E': 231 case 'I': 232 case 'O': 233 case 'U': 234 if (n == 0) { 235 code.append(symb); 236 } 237 break; // only use vowel if leading char 238 case 'B': 239 if (isPreviousChar(local, n, 'M') && isLastChar(wdsz, n)) { // B is silent if word ends in MB 240 break; 241 } 242 code.append(symb); 243 break; 244 case 'C': // lots of C special cases 245 /* discard if SCI, SCE or SCY */ 246 if (isPreviousChar(local, n, 'S') && !isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) { 247 break; 248 } 249 if (regionMatch(local, n, "CIA")) { // "CIA" -> X 250 code.append('X'); 251 break; 252 } 253 if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0) { 254 code.append('S'); 255 break; // CI,CE,CY -> S 256 } 257 if (isPreviousChar(local, n, 'S') && isNextChar(local, n, 'H')) { // SCH->sk 258 code.append('K'); 259 break; 260 } 261 if (!isNextChar(local, n, 'H') || n == 0 && wdsz >= 3 && isVowel(local, 2)) { // CH consonant -> K consonant 262 code.append('K'); 263 } else { 264 code.append('X'); // CHvowel -> X 265 } 266 break; 267 case 'D': 268 if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'G') && FRONTV.indexOf(local.charAt(n + 2)) >= 0) { // DGE DGI DGY -> J 269 code.append('J'); 270 n += 2; 271 } else { 272 code.append('T'); 273 } 274 break; 275 case 'G': // GH silent at end or before consonant 276 if (isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H')) { 277 break; 278 } 279 if (!isLastChar(wdsz, n + 1) && isNextChar(local, n, 'H') && !isVowel(local, n + 2)) { 280 break; 281 } 282 if (n > 0 && (regionMatch(local, n, "GN") || regionMatch(local, n, "GNED"))) { 283 break; // silent G 284 } 285 // NOTE: Given that duplicated chars are removed, I don't see how this can ever be true 286 hard = isPreviousChar(local, n, 'G'); 287 if (!isLastChar(wdsz, n) && FRONTV.indexOf(local.charAt(n + 1)) >= 0 && !hard) { 288 code.append('J'); 289 } else { 290 code.append('K'); 291 } 292 break; 293 case 'H': 294 if (isLastChar(wdsz, n)) { 295 break; // terminal H 296 } 297 if (n > 0 && VARSON.indexOf(local.charAt(n - 1)) >= 0) { 298 break; 299 } 300 if (isVowel(local, n + 1)) { 301 code.append('H'); // Hvowel 302 } 303 break; 304 case 'F': 305 case 'J': 306 case 'L': 307 case 'M': 308 case 'N': 309 case 'R': 310 code.append(symb); 311 break; 312 case 'K': 313 if (n > 0) { // not initial 314 if (!isPreviousChar(local, n, 'C')) { 315 code.append(symb); 316 } 317 } else { 318 code.append(symb); // initial K 319 } 320 break; 321 case 'P': 322 if (isNextChar(local, n, 'H')) { 323 // PH -> F 324 code.append('F'); 325 } else { 326 code.append(symb); 327 } 328 break; 329 case 'Q': 330 code.append('K'); 331 break; 332 case 'S': 333 if (regionMatch(local, n, "SH") || regionMatch(local, n, "SIO") || regionMatch(local, n, "SIA")) { 334 code.append('X'); 335 } else { 336 code.append('S'); 337 } 338 break; 339 case 'T': 340 if (regionMatch(local, n, "TIA") || regionMatch(local, n, "TIO")) { 341 code.append('X'); 342 break; 343 } 344 if (regionMatch(local, n, "TCH")) { 345 // Silent if in "TCH" 346 break; 347 } 348 // substitute numeral 0 for TH (resembles theta after all) 349 if (regionMatch(local, n, "TH")) { 350 code.append('0'); 351 } else { 352 code.append('T'); 353 } 354 break; 355 case 'V': 356 code.append('F'); 357 break; 358 case 'W': 359 case 'Y': // silent if not followed by vowel 360 if (!isLastChar(wdsz, n) && isVowel(local, n + 1)) { 361 code.append(symb); 362 } 363 break; 364 case 'X': 365 code.append('K'); 366 code.append('S'); 367 break; 368 case 'Z': 369 code.append('S'); 370 break; 371 default: 372 // do nothing 373 break; 374 } // end switch 375 } // end else from symb != 'C' 376 n++; 377 if (code.length() > getMaxCodeLen()) { 378 code.setLength(getMaxCodeLen()); 379 } 380 } 381 return code.toString(); 382 } 383 384 private boolean regionMatch(final StringBuilder string, final int index, final String test) { 385 boolean matches = false; 386 if (index >= 0 && index + test.length() - 1 < string.length()) { 387 final String substring = string.substring(index, index + test.length()); 388 matches = substring.equals(test); 389 } 390 return matches; 391 } 392 393 /** 394 * Sets the maxCodeLen. 395 * 396 * @param maxCodeLen The maxCodeLen to set. 397 */ 398 public void setMaxCodeLen(final int maxCodeLen) { 399 this.maxCodeLen = maxCodeLen; 400 } 401 402}