Fix https://bz.apache.org/bugzilla/show_bug.cgi?id=63235

markt-asf · markt-asf · commit 18547a48cba6 · 2019-03-11T11:04:19.000Z
Implement an alternative Charset caching strategy that does not load all
Charsets at Tomcat start. This reduces start time by ~30ms and does not,
based on the performance tests included in this commit, have a negative
impact on runtime look-ups.
It does require that the names of all supported Charsets are known to
Tomcat at compile time. The code has been tested with a range of JVMs
and a unit test is provided for testing new JVMs.
diff --git a/java/org/apache/tomcat/util/buf/B2CConverter.java b/java/org/apache/tomcat/util/buf/B2CConverter.java
@@ -25,9 +25,7 @@
 import java.nio.charset.CoderResult;
 import java.nio.charset.CodingErrorAction;
 import java.nio.charset.StandardCharsets;
-import java.util.HashMap;
 import java.util.Locale;
-import java.util.Map;
 
 import org.apache.tomcat.util.res.StringManager;
 
@@ -38,23 +36,14 @@ public class B2CConverter {
 
     private static final StringManager sm = StringManager.getManager(B2CConverter.class);
 
-    private static final Map<String, Charset> encodingToCharsetCache =
-            new HashMap<>();
-
-    // Protected so unit tests can use it
-    protected static final int LEFTOVER_SIZE = 9;
+    private static final CharsetCache charsetCache;
 
     static {
-        for (Charset charset: Charset.availableCharsets().values()) {
-            encodingToCharsetCache.put(
-                    charset.name().toLowerCase(Locale.ENGLISH), charset);
-            for (String alias : charset.aliases()) {
-                encodingToCharsetCache.put(
-                        alias.toLowerCase(Locale.ENGLISH), charset);
-            }
-        }
+        charsetCache = new CharsetCache();
     }
 
+    // Protected so unit tests can use it
+    protected static final int LEFTOVER_SIZE = 9;
 
     /**
      * Obtain the Charset for the given encoding
@@ -71,7 +60,7 @@ public static Charset getCharset(String enc) throws UnsupportedEncodingException
         // Encoding names should all be ASCII
         String lowerCaseEnc = enc.toLowerCase(Locale.ENGLISH);
 
-        Charset charset = encodingToCharsetCache.get(lowerCaseEnc);
+        Charset charset = charsetCache.getCharset(lowerCaseEnc);
 
         if (charset == null) {
             // Pre-population of the cache means this must be invalid
diff --git a/java/org/apache/tomcat/util/buf/CharsetCache.java b/java/org/apache/tomcat/util/buf/CharsetCache.java
@@ -0,0 +1,134 @@
+/*
+ *  Licensed to the Apache Software Foundation (ASF) under one or more
+ *  contributor license agreements.  See the NOTICE file distributed with
+ *  this work for additional information regarding copyright ownership.
+ *  The ASF licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.apache.tomcat.util.buf;
+
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.util.Locale;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.ConcurrentMap;
+
+public class CharsetCache {
+
+    private static final String[] INITIAL_CHARSETS = new String[] { "iso-8859-1", "utf-8" };
+
+    /*
+     * Tested with:
+     *  - Oracle JDK 8 u192
+     *  - OpenJDK 13 EA 4
+     */
+    private static final String[] LAZY_CHARSETS = new String[] {
+            "big5", "big5-hkscs", "cesu-8", "euc-jp", "euc-kr", "gb18030", "gb2312", "gbk", "ibm-thai", "ibm00858",
+            "ibm01140", "ibm01141", "ibm01142", "ibm01143", "ibm01144", "ibm01145", "ibm01146", "ibm01147", "ibm01148",
+            "ibm01149", "ibm037", "ibm1026", "ibm1047", "ibm273", "ibm277", "ibm278", "ibm280", "ibm284", "ibm285",
+            "ibm290", "ibm297", "ibm420", "ibm424", "ibm437", "ibm500", "ibm775", "ibm850", "ibm852", "ibm855",
+            "ibm857", "ibm860", "ibm861", "ibm862", "ibm863", "ibm864", "ibm865", "ibm866", "ibm868", "ibm869",
+            "ibm870", "ibm871", "ibm918", "iso-2022-cn", "iso-2022-jp", "iso-2022-jp-2", "iso-2022-kr", "iso-8859-13",
+            "iso-8859-15", "iso-8859-2", "iso-8859-3", "iso-8859-4", "iso-8859-5", "iso-8859-6", "iso-8859-7",
+            "iso-8859-8", "iso-8859-9", "iso-8859-16", "jis_x0201", "jis_x0212-1990", "koi8-r", "koi8-u", "shift_jis",
+            "tis-620", "us-ascii", "utf-16", "utf-16be", "utf-16le", "utf-32", "utf-32be", "utf-32le", "x-utf-32be-bom",
+            "x-utf-32le-bom", "windows-1250", "windows-1251", "windows-1252", "windows-1253", "windows-1254",
+            "windows-1255", "windows-1256", "windows-1257", "windows-1258", "windows-31j", "x-big5-hkscs-2001",
+            "x-big5-solaris", "x-compound_text", "x-euc-tw", "x-ibm1006", "x-ibm1025", "x-ibm1046", "x-ibm1097",
+            "x-ibm1098", "x-ibm1112", "x-ibm1122", "x-ibm1123", "x-ibm1124", "x-ibm1129", "x-ibm1166", "x-ibm1364",
+            "x-ibm1381", "x-ibm1383", "x-ibm300", "x-ibm33722", "x-ibm737", "x-ibm833", "x-ibm834", "x-ibm856",
+            "x-ibm874", "x-ibm875", "x-ibm921", "x-ibm922", "x-ibm930", "x-ibm933", "x-ibm935", "x-ibm937", "x-ibm939",
+            "x-ibm942", "x-ibm942c", "x-ibm943", "x-ibm943c", "x-ibm948", "x-ibm949", "x-ibm949c", "x-ibm950",
+            "x-ibm964", "x-ibm970", "x-iscii91", "x-iso-2022-cn-cns", "x-iso-2022-cn-gb", "x-jis0208",
+            "x-jisautodetect", "x-johab", "x-ms932_0213", "x-ms950-hkscs", "x-ms950-hkscs-xp", "x-macarabic",
+            "x-maccentraleurope", "x-maccroatian", "x-maccyrillic", "x-macdingbat", "x-macgreek", "x-machebrew",
+            "x-maciceland", "x-macroman", "x-macromania", "x-macsymbol", "x-macthai", "x-macturkish", "x-macukraine",
+            "x-pck", "x-sjis_0213", "x-utf-16le-bom", "x-euc-jp-linux", "x-eucjp-open", "x-iso-8859-11", "x-mswin-936",
+            "x-windows-50220", "x-windows-50221", "x-windows-874", "x-windows-949", "x-windows-950",
+            "x-windows-iso2022jp"
+            };
+
+    private static final Charset DUMMY_CHARSET = new DummyCharset("Dummy",  null);
+
+    private ConcurrentMap<String,Charset> cache = new ConcurrentHashMap<>();
+
+    public CharsetCache() {
+        // Pre-populate the cache
+        for (String charsetName : INITIAL_CHARSETS) {
+            Charset charset = Charset.forName(charsetName);
+            addToCache(charsetName, charset);
+        }
+
+        for (String charsetName : LAZY_CHARSETS) {
+            addToCache(charsetName, DUMMY_CHARSET);
+        }
+    }
+
+
+    private void addToCache(String name, Charset charset) {
+        cache.put(name, charset);
+        for (String alias : charset.aliases()) {
+            cache.put(alias.toLowerCase(Locale.ENGLISH), charset);
+        }
+    }
+
+
+    public Charset getCharset(String charsetName) {
+        String lcCharsetName = charsetName.toLowerCase(Locale.ENGLISH);
+
+        Charset result = cache.get(lcCharsetName);
+
+        if (result == DUMMY_CHARSET) {
+            // Name is known but the Charset is not in the cache
+            Charset charset = Charset.forName(lcCharsetName);
+            if (charset == null) {
+                // Charset not available in this JVM - remove cache entry
+                cache.remove(lcCharsetName);
+                result = null;
+            } else {
+                // Charset is available - populate cache entry
+                addToCache(lcCharsetName, charset);
+                result = charset;
+            }
+        }
+
+        return result;
+    }
+
+
+    /*
+     * Placeholder Charset implementation for entries that will be loaded lazily
+     * into the cache.
+     */
+    private static class DummyCharset extends Charset {
+
+        protected DummyCharset(String canonicalName, String[] aliases) {
+            super(canonicalName, aliases);
+        }
+
+        @Override
+        public boolean contains(Charset cs) {
+            return false;
+        }
+
+        @Override
+        public CharsetDecoder newDecoder() {
+            return null;
+        }
+
+        @Override
+        public CharsetEncoder newEncoder() {
+            return null;
+        }
+    }
+}
diff --git a/test/org/apache/tomcat/util/buf/TestCharsetCache.java b/test/org/apache/tomcat/util/buf/TestCharsetCache.java
@@ -0,0 +1,65 @@
+/*
+ *  Licensed to the Apache Software Foundation (ASF) under one or more
+ *  contributor license agreements.  See the NOTICE file distributed with
+ *  this work for additional information regarding copyright ownership.
+ *  The ASF licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.apache.tomcat.util.buf;
+
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+import java.util.Locale;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+public class TestCharsetCache {
+
+    @Test
+    public void testAllKnownCharsets() {
+        CharsetCache cache = new CharsetCache();
+
+        List<String> cacheMisses = new ArrayList<>();
+
+        for (Charset charset: Charset.availableCharsets().values()) {
+            if (cache.getCharset(charset.name()) == null) {
+                cacheMisses.add(charset.name());
+            } else {
+                for (String alias : charset.aliases()) {
+                    if (cache.getCharset(alias) == null) {
+                        cacheMisses.add(alias);
+                    }
+                }
+            }
+        }
+
+        if (cacheMisses.size() != 0) {
+            StringBuilder sb = new StringBuilder();
+            Collections.sort(cacheMisses);
+            for (String name : cacheMisses) {
+                if (sb.length() == 0) {
+                    sb.append('"');
+                } else {
+                    sb.append(", \"");
+                }
+                sb.append(name.toLowerCase(Locale.ENGLISH));
+                sb.append('"');
+            }
+            System.out.println(sb.toString());
+        }
+
+        Assert.assertTrue(cacheMisses.size() == 0);
+    }
+}
diff --git a/test/org/apache/tomcat/util/buf/TestCharsetCachePerformance.java b/test/org/apache/tomcat/util/buf/TestCharsetCachePerformance.java
@@ -0,0 +1,141 @@
+/*
+ *  Licensed to the Apache Software Foundation (ASF) under one or more
+ *  contributor license agreements.  See the NOTICE file distributed with
+ *  this work for additional information regarding copyright ownership.
+ *  The ASF licenses this file to You under the Apache License, Version 2.0
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+package org.apache.tomcat.util.buf;
+
+import java.nio.charset.Charset;
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+
+import org.junit.Test;
+
+public class TestCharsetCachePerformance {
+
+    @Test
+    public void testNoCsCache() throws Exception {
+        doTest(new NoCsCache());
+    }
+
+
+    @Test
+    public void testFullCsCache() throws Exception {
+        doTest(new FullCsCache());
+    }
+
+
+    @Test
+    public void testLazyCsCache() throws Exception {
+        doTest(new LazyCsCache());
+    }
+
+
+    private void doTest(CsCache cache) throws Exception {
+        int threadCount = 10;
+        int iterations = 10000000;
+        String[] lookupNames = new String[] {
+                "ISO-8859-1", "ISO-8859-2", "ISO-8859-3", "ISO-8859-4", "ISO-8859-5" };
+
+        Thread[] threads = new Thread[threadCount];
+
+        for (int i = 0; i < threadCount; i++) {
+            threads[i] = new TestCsCacheThread(iterations, cache, lookupNames);
+        }
+
+        long startTime = System.nanoTime();
+
+        for (int i = 0; i < threadCount; i++) {
+            threads[i].start();
+        }
+
+        for (int i = 0; i < threadCount; i++) {
+            threads[i].join();
+        }
+
+        long endTime = System.nanoTime();
+
+        System.out.println(cache.getClass().getName() + ": " + (endTime - startTime) + "ns");
+    }
+
+
+    private static interface CsCache {
+        Charset getCharset(String charsetName);
+    }
+
+
+    private static class NoCsCache implements CsCache {
+
+        @Override
+        public Charset getCharset(String charsetName) {
+            return Charset.forName(charsetName);
+        }
+    }
+
+
+    private static class FullCsCache implements CsCache {
+
+        private static final Map<String,Charset> cache = new HashMap<>();
+
+        static {
+            for (Charset charset: Charset.availableCharsets().values()) {
+                cache.put(charset.name().toLowerCase(Locale.ENGLISH), charset);
+                for (String alias : charset.aliases()) {
+                    cache.put(alias.toLowerCase(Locale.ENGLISH), charset);
+                }
+            }
+        }
+
+
+        @Override
+        public Charset getCharset(String charsetName) {
+            return cache.get(charsetName.toLowerCase(Locale.ENGLISH));
+        }
+    }
+
+
+    private static class LazyCsCache implements CsCache {
+
+        private CharsetCache cache = new CharsetCache();
+
+        @Override
+        public Charset getCharset(String charsetName) {
+            return cache.getCharset(charsetName);
+        }
+    }
+
+
+    private static class TestCsCacheThread extends Thread {
+
+        private final int iterations;
+        private final CsCache cache;
+        private final String[] lookupNames;
+        private final int lookupNamesCount;
+
+        public TestCsCacheThread(int iterations, CsCache cache, String[] lookupNames) {
+            this.iterations = iterations;
+            this.cache = cache;
+            this.lookupNames = lookupNames;
+            this.lookupNamesCount = lookupNames.length;
+        }
+
+        @Override
+        public void run() {
+            for (int i = 0; i < iterations; i++) {
+                cache.getCharset(lookupNames[i % lookupNamesCount]);
+            }
+        }
+    }
+}
diff --git a/webapps/docs/changelog.xml b/webapps/docs/changelog.xml
@@ -92,6 +92,9 @@
         <bug>63246</bug>: Fix a potential <code>NullPointerException</code> when
         calling <code>AsyncContext.dispatch()</code>. (markt)
       </fix>
+      <fix>
+        <bug>63235</bug>: Refactor Charset cache to reduce start time. (markt)
+      </fix>
     </changelog>
   </subsection>
   <subsection name="Coyote">