View Javadoc

1   /**
2    * This file Copyright (c) 2009-2010 Magnolia International
3    * Ltd.  (http://www.magnolia-cms.com). All rights reserved.
4    *
5    *
6    * This file is dual-licensed under both the Magnolia
7    * Network Agreement and the GNU General Public License.
8    * You may elect to use one or the other of these licenses.
9    *
10   * This file is distributed in the hope that it will be
11   * useful, but AS-IS and WITHOUT ANY WARRANTY; without even the
12   * implied warranty of MERCHANTABILITY or FITNESS FOR A
13   * PARTICULAR PURPOSE, TITLE, or NONINFRINGEMENT.
14   * Redistribution, except as permitted by whichever of the GPL
15   * or MNA you select, is prohibited.
16   *
17   * 1. For the GPL license (GPL), you can redistribute and/or
18   * modify this file under the terms of the GNU General
19   * Public License, Version 3, as published by the Free Software
20   * Foundation.  You should have received a copy of the GNU
21   * General Public License, Version 3 along with this program;
22   * if not, write to the Free Software Foundation, Inc., 51
23   * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
24   *
25   * 2. For the Magnolia Network Agreement (MNA), this file
26   * and the accompanying materials are made available under the
27   * terms of the MNA which accompanies this distribution, and
28   * is available at http://www.magnolia-cms.com/mna.html
29   *
30   * Any modifications to this file must keep this entire header
31   * intact.
32   *
33   */
34  package info.magnolia.cms.util;
35  
36  import info.magnolia.cms.core.SystemProperty;
37  import info.magnolia.objectfactory.Components;
38  
39  import java.lang.reflect.InvocationTargetException;
40  import java.lang.reflect.Method;
41  
42  /**
43   * A wrapper around java.text.Normalizer and com.ibm.icu.text.Normalizer; uses the former if present, or none
44   * if none is present.
45   *
46   * <strong>note:</strong> if not running under Java >=6, and without ICU, this does nothing.
47   * If needed, one can use their own implementation, by setting the info.magnolia.cms.util.UnicodeNormalizer$Normalizer
48   * system property.
49   *
50   * @see java.text.Normalizer
51   * @see com.ibm.icu.text.Normalizer
52   * @see <a href="http://www.icu-project.org/">http://www.icu-project.org/</a> to get the ICU4J library.
53   * @see <a href="http://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms">http://en.wikipedia.org/wiki/Unicode_equivalence#Normal_forms</a> for more information.
54   *
55   * @author gjoseph
56   * @version $Revision: $ ($Author: $)
57   */
58  public class UnicodeNormalizer {
59      private static final org.slf4j.Logger log = org.slf4j.LoggerFactory.getLogger(UnicodeNormalizer.class);
60  
61      private static final String JAVA6_NORMALIZER_CLASS = "java.text.Normalizer";
62      private static final String JAVA6_FORMPARAM_CLASS = "java.text.Normalizer$Form";
63      private static final String ICU_NORMALIZER_CLASS = "com.ibm.icu.text.Normalizer";
64  
65      private static final Normalizer normalizer = Components.getSingleton(Normalizer.class);
66  
67      /**
68       * Normalizes the given String to the NFC form.
69       */
70      public static String normalizeNFC(String in) {
71          return normalizer.normalizeNFC(in);
72          /* if you're in dire need to debug:
73           try {
74              log.debug("not normalized: " + Arrays.toString(in.getBytes("UTF-8")) + " (" + in + ")");
75              String out = normalizer.normalizeNFC(in);
76              log.debug("    normalized: " + Arrays.toString(out.getBytes("UTF-8")) + " (" + out + ")");
77              return out;
78          } catch (UnsupportedEncodingException e) {
79              // do nothing
80          }
81          return in;
82          */
83      }
84  
85      /**
86       * Used to normalize a String.
87       */
88      public interface Normalizer {
89          String normalizeNFC(String in);
90      }
91  
92      /**
93       * This uses reflection, since we're still compiling with Java 5.
94       * This implementation could be externalized to a "java 6 only" module if needed.
95       */
96      public static final class Java6ReflectionNormalizer implements Normalizer {
97          private final Method normalize;
98          private final Object nfc;
99  
100         public Java6ReflectionNormalizer() {
101             try {
102                 final Class<?> normalizer = Class.forName(JAVA6_NORMALIZER_CLASS);
103                 final Class<?> form = Class.forName(JAVA6_FORMPARAM_CLASS);
104                 normalize = normalizer.getMethod("normalize", CharSequence.class, form);
105                 nfc = form.getField("NFC").get(null);
106             } catch (ClassNotFoundException e) {
107                 throw new RuntimeException(e);
108             } catch (IllegalAccessException e) {
109                 throw new RuntimeException(e);
110             } catch (NoSuchFieldException e) {
111                 throw new RuntimeException(e);
112             } catch (NoSuchMethodException e) {
113                 throw new RuntimeException(e);
114             }
115 
116         }
117 
118         public String normalizeNFC(String in) {
119             try {
120                 return (String) normalize.invoke(null, in, nfc);
121             } catch (IllegalAccessException e) {
122                 throw new RuntimeException(e);
123             } catch (InvocationTargetException e) {
124                 throw new RuntimeException(e);
125             }
126         }
127     }
128 
129     /**
130      * Uses {@link com.ibm.icu.text.Normalizer} to normalize the string.
131      */
132     public static final class ICUNormalizer implements UnicodeNormalizer.Normalizer {
133         public String normalizeNFC(String in) {
134             return com.ibm.icu.text.Normalizer.normalize(in, com.ibm.icu.text.Normalizer.NFC);
135         }
136     }
137 
138     /**
139      * Returns the original value unchanged.
140      */
141     public static final class NonNormalizer implements UnicodeNormalizer.Normalizer {
142         public String normalizeNFC(String in) {
143             return in;
144         }
145     }
146 
147     /**
148      * Tries to load the normalizer dynamically and respects the property {@link SystemProperty#MAGNOLIA_UTF8_ENABLED}.
149      */
150     public static final class AutoDetectNormalizer implements Normalizer {
151         private final Normalizer delegate;
152 
153         public AutoDetectNormalizer() {
154             Normalizer candidate;
155             if (!SystemProperty.getBooleanProperty(SystemProperty.MAGNOLIA_UTF8_ENABLED)) {
156                 candidate = new NonNormalizer();
157             } else {
158                 try {
159                     Class.forName(JAVA6_NORMALIZER_CLASS);
160                     candidate = new Java6ReflectionNormalizer();
161                     log.info("Running on Java 6, using {} for unicode form normalization.", candidate.getClass());
162                 } catch (ClassNotFoundException e) {
163                     log.warn("Not running on Java 6 ({} not found). Attempting to locate the ICU4J library.", JAVA6_NORMALIZER_CLASS);
164                     try {
165                         Class.forName(ICU_NORMALIZER_CLASS);
166                         candidate = new ICUNormalizer();
167                         log.info("ICU4J found, using {} for Unicode form normalization.", candidate.getClass());
168                     } catch (ClassNotFoundException e2) {
169                         log.warn("ICU4J not found ({} not found), Unicode will not be 100% supported; no Unicode form normalization available. If Java 6 is not an option, you can get the ICU4J library from http://www.icu-project.org/.", ICU_NORMALIZER_CLASS);
170                         candidate = new NonNormalizer();
171                     }
172                 }
173             }
174             this.delegate = candidate;
175         }
176 
177         public String normalizeNFC(String in) {
178             return delegate.normalizeNFC(in);
179         }
180     }
181 
182 }