001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      https://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018package org.apache.commons.codec.language;
019
020import org.apache.commons.codec.EncoderException;
021import org.apache.commons.codec.StringEncoder;
022
023/**
024 * Encodes a string into a Soundex value. Soundex is an encoding used to relate similar names, but can also be used as a
025 * general purpose scheme to find word with similar phonemes.
026 *
027 * <p>This class is thread-safe.
028 * Although not strictly immutable, the mutable fields are not actually used.</p>
029 */
030public class Soundex implements StringEncoder {
031
032    /**
033     * The marker character used to indicate a silent (ignored) character.
034     * These are ignored except when they appear as the first character.
035     * <p>
036     * Note: The {@link #US_ENGLISH_MAPPING_STRING} does not use this mechanism
037     * because changing it might break existing code. Mappings that don't contain
038     * a silent marker code are treated as though H and W are silent.
039     * </p>
040     * <p>
041     * To override this, use the {@link #Soundex(String, boolean)} constructor.
042     * </p>
043     *
044     * @since 1.11
045     */
046    public static final char SILENT_MARKER = '-';
047
048    /**
049     * This is a default mapping of the 26 letters used in US English. A value of {@code 0} for a letter position
050     * means do not encode, but treat as a separator when it occurs between consonants with the same code.
051     * <p>
052     * (This constant is provided as both an implementation convenience and to allow Javadoc to pick
053     * up the value for the constant values page.)
054     * </p>
055     * <p>
056     * <strong>Note that letters H and W are treated specially.</strong>
057     * They are ignored (after the first letter) and don't act as separators
058     * between consonants with the same code.
059     * </p>
060     */
061    public static final String US_ENGLISH_MAPPING_STRING = "01230120022455012623010202";
062
063    /**
064     * This is a default mapping of the 26 letters used in US English. A value of {@code 0} for a letter position
065     * means do not encode.
066     *
067     * @see Soundex#Soundex(char[])
068     */
069    private static final char[] US_ENGLISH_MAPPING = US_ENGLISH_MAPPING_STRING.toCharArray();
070
071    /**
072     * An instance of Soundex using the US_ENGLISH_MAPPING mapping.
073     * This treats H and W as silent letters.
074     * Apart from when they appear as the first letter, they are ignored.
075     * They don't act as separators between duplicate codes.
076     *
077     * @see #US_ENGLISH_MAPPING_STRING
078     */
079    public static final Soundex US_ENGLISH = new Soundex();
080
081    /**
082     * An instance of Soundex using the Simplified Soundex mapping, as described here:
083     * http://west-penwith.org.uk/misc/soundex.htm
084     * <p>
085     * This treats H and W the same as vowels (AEIOUY).
086     * Such letters aren't encoded (after the first), but they do
087     * act as separators when dropping duplicate codes.
088     * The mapping is otherwise the same as for {@link #US_ENGLISH}
089     * </p>
090     *
091     * @since 1.11
092     */
093    public static final Soundex US_ENGLISH_SIMPLIFIED = new Soundex(US_ENGLISH_MAPPING_STRING, false);
094
095    /**
096     * An instance of Soundex using the mapping as per the Genealogy site:
097     * http://www.genealogy.com/articles/research/00000060.html
098     * <p>
099     * This treats vowels (AEIOUY), H and W as silent letters.
100     * Such letters are ignored (after the first) and do not
101     * act as separators when dropping duplicate codes.
102     * </p>
103     * <p>
104     * The codes for consonants are otherwise the same as for
105     * {@link #US_ENGLISH_MAPPING_STRING} and {@link #US_ENGLISH_SIMPLIFIED}
106     * </p>
107     *
108     * @since 1.11
109     */
110    public static final Soundex US_ENGLISH_GENEALOGY = new Soundex("-123-12--22455-12623-1-2-2");
111    //                                                              ABCDEFGHIJKLMNOPQRSTUVWXYZ
112
113    /**
114     * The maximum length of a Soundex code - Soundex codes are only four characters by definition.
115     *
116     * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
117     */
118    @Deprecated
119    private int maxLength = 4;
120
121    /**
122     * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
123     * letter is mapped. This implementation contains a default map for US_ENGLISH
124     */
125    private final char[] soundexMapping;
126
127    /**
128     * Should H and W be treated specially?
129     * <p>
130     * In versions of the code prior to 1.11,
131     * the code always treated H and W as silent (ignored) letters.
132     * If this field is false, H and W are no longer special-cased.
133     * </p>
134     */
135    private final boolean specialCaseHW;
136
137    /**
138     * Creates an instance using US_ENGLISH_MAPPING.
139     *
140     * @see Soundex#Soundex(char[])
141     * @see Soundex#US_ENGLISH_MAPPING_STRING
142     */
143    public Soundex() {
144        this.soundexMapping = US_ENGLISH_MAPPING;
145        this.specialCaseHW = true;
146    }
147
148    /**
149     * Creates a Soundex instance using the given mapping. This constructor can be used to provide an internationalized
150     * mapping for a non-Western character set.
151     * <p>
152     * Every letter of the alphabet is "mapped" to a numerical value. This char array holds the values to which each
153     * letter is mapped. This implementation contains a default map for US_ENGLISH
154     * </p>
155     * <p>
156     * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment.
157     * </p>
158     *
159     * @param mapping
160     *                  Mapping array to use when finding the corresponding code for a given character.
161     */
162    public Soundex(final char[] mapping) {
163        this.soundexMapping = mapping.clone();
164        this.specialCaseHW = !hasMarker(this.soundexMapping);
165    }
166
167    /**
168     * Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping,
169     * and/or possibly provide an internationalized mapping for a non-Western character set.
170     * <p>
171     * If the mapping contains an instance of {@link #SILENT_MARKER} then H and W are not given special treatment.
172     * </p>
173     *
174     * @param mapping
175     *            Mapping string to use when finding the corresponding code for a given character.
176     * @since 1.4
177     */
178    public Soundex(final String mapping) {
179        this.soundexMapping = mapping.toCharArray();
180        this.specialCaseHW = !hasMarker(this.soundexMapping);
181    }
182
183    /**
184     * Creates a refined Soundex instance using a custom mapping. This constructor can be used to customize the mapping,
185     * and/or possibly provide an internationalized mapping for a non-Western character set.
186     *
187     * @param mapping
188     *            Mapping string to use when finding the corresponding code for a given character.
189     * @param specialCaseHW if true, then
190     * @since 1.11
191     */
192    public Soundex(final String mapping, final boolean specialCaseHW) {
193        this.soundexMapping = mapping.toCharArray();
194        this.specialCaseHW = specialCaseHW;
195    }
196
197    /**
198     * Encodes the Strings and returns the number of characters in the two encoded Strings that are the same. This return value ranges from 0 through 4: 0
199     * indicates little or no similarity, and 4 indicates strong similarity or identical values.
200     *
201     * @param s1 A String that will be encoded and compared.
202     * @param s2 A String that will be encoded and compared.
203     * @return The number of characters in the two encoded Strings that are the same from 0 to 4.
204     * @see SoundexUtils#difference(StringEncoder,String,String)
205     * @see <a href="https://msdn.microsoft.com/library/default.asp?url=/library/en-us/tsqlref/ts_de-dz_8co5.asp"> MS T-SQL DIFFERENCE</a>
206     *
207     * @throws EncoderException if an error occurs encoding one of the strings.
208     * @since 1.3
209     */
210    public int difference(final String s1, final String s2) throws EncoderException {
211        return SoundexUtils.difference(this, s1, s2);
212    }
213
214    /**
215     * Encodes an Object using the Soundex algorithm. This method is provided in order to satisfy the requirements of the Encoder interface, and will throw an
216     * EncoderException if the supplied object is not of type {@link String}.
217     *
218     * @param obj Object to encode.
219     * @return An object (or type {@link String}) containing the Soundex code which corresponds to the String supplied.
220     * @throws EncoderException         if the parameter supplied is not of type {@link String}.
221     * @throws IllegalArgumentException if a character is not mapped.
222     */
223    @Override
224    public Object encode(final Object obj) throws EncoderException {
225        if (!(obj instanceof String)) {
226            throw new EncoderException("Parameter supplied to Soundex encode is not of type java.lang.String");
227        }
228        return soundex((String) obj);
229    }
230
231    /**
232     * Encodes a String using the Soundex algorithm.
233     *
234     * @param str A String object to encode.
235     * @return A Soundex code corresponding to the String supplied.
236     * @throws IllegalArgumentException if a character is not mapped.
237     */
238    @Override
239    public String encode(final String str) {
240        return soundex(str);
241    }
242
243    /**
244     * Returns the maxLength. Standard Soundex
245     *
246     * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
247     * @return the maxLength.
248     */
249    @Deprecated
250    public int getMaxLength() {
251        return this.maxLength;
252    }
253
254    private boolean hasMarker(final char[] mapping) {
255        for (final char ch : mapping) {
256            if (ch == SILENT_MARKER) {
257                return true;
258            }
259        }
260        return false;
261    }
262
263    /**
264     * Maps the given upper-case character to its Soundex code.
265     *
266     * @param ch
267     *                  An upper-case character.
268     * @return A Soundex code.
269     * @throws IllegalArgumentException
270     *                  Thrown if {@code ch} is not mapped.
271     */
272    private char map(final char ch) {
273        final int index = ch - 'A';
274        if (index < 0 || index >= this.soundexMapping.length) {
275            throw new IllegalArgumentException("The character is not mapped: " + ch + " (index=" + index + ")");
276        }
277        return this.soundexMapping[index];
278    }
279
280    /**
281     * Sets the maxLength.
282     *
283     * @deprecated This feature is not needed since the encoding size must be constant. Will be removed in 2.0.
284     * @param maxLength
285     *                  The maxLength to set.
286     */
287    @Deprecated
288    public void setMaxLength(final int maxLength) {
289        this.maxLength = maxLength;
290    }
291
292    /**
293     * Retrieves the Soundex code for a given String object.
294     *
295     * @param str String to encode using the Soundex algorithm.
296     * @return A Soundex code for the String supplied.
297     * @throws IllegalArgumentException if a character is not mapped.
298     */
299    public String soundex(String str) {
300        if (str == null) {
301            return null;
302        }
303        str = SoundexUtils.clean(str);
304        if (str.isEmpty()) {
305            return str;
306        }
307        final char[] out = { '0', '0', '0', '0' };
308        int count = 0;
309        final char first = str.charAt(0);
310        out[count++] = first;
311        char lastDigit = map(first); // previous digit
312        for (int i = 1; i < str.length() && count < out.length; i++) {
313            final char ch = str.charAt(i);
314            if (this.specialCaseHW && (ch == 'H' || ch == 'W')) { // these are ignored completely
315                continue;
316            }
317            final char digit = map(ch);
318            if (digit == SILENT_MARKER) {
319                continue;
320            }
321            if (digit != '0' && digit != lastDigit) { // don't store vowels or repeats
322                out[count++] = digit;
323            }
324            lastDigit = digit;
325        }
326        return new String(out);
327    }
328
329}