diff options
author | sherman <none@none> | 2008-08-22 14:37:46 -0700 |
---|---|---|
committer | sherman <none@none> | 2008-08-22 14:37:46 -0700 |
commit | 713b02015dfeb6ab332c2b24c3d30dd33212c7ae (patch) | |
tree | 3003191b2415185cd1c32960266fb031f9cf3756 /test/sun/nio | |
parent | 396db6c1c024b05c322d545fc0eef1927942adf6 (diff) |
4486841: UTF-8 decoder should adhere to corrigendum to Unicode 3.0.1
6636317: Optimize UTF-8 coder for ASCII input
Summary: re-write the UTF-8 charset to obey the standard and improve the performance
Reviewed-by: alanb
Diffstat (limited to 'test/sun/nio')
-rw-r--r-- | test/sun/nio/cs/TestUTF8.java | 393 |
1 files changed, 393 insertions, 0 deletions
diff --git a/test/sun/nio/cs/TestUTF8.java b/test/sun/nio/cs/TestUTF8.java new file mode 100644 index 000000000..967054573 --- /dev/null +++ b/test/sun/nio/cs/TestUTF8.java @@ -0,0 +1,393 @@ +/* + * Copyright 2008 Sun Microsystems, Inc. All Rights Reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + */ + +/* + * @test + * @bug 4486841 + * @summary Test UTF-8 charset + */ + +import java.nio.charset.*; +import java.nio.*; +import java.util.*; + +public class TestUTF8 { + static char[] decode(byte[] bb, String csn, boolean testDirect) + throws Exception { + CharsetDecoder dec = Charset.forName(csn).newDecoder(); + ByteBuffer bbf; + CharBuffer cbf; + if (testDirect) { + bbf = ByteBuffer.allocateDirect(bb.length); + cbf = ByteBuffer.allocateDirect(bb.length*2).asCharBuffer(); + bbf.put(bb).flip(); + } else { + bbf = ByteBuffer.wrap(bb); + cbf = CharBuffer.allocate(bb.length); + } + CoderResult cr = dec.decode(bbf, cbf, true); + if (cr != CoderResult.UNDERFLOW) + throw new RuntimeException("Decoding err: " + csn); + char[] cc = new char[cbf.position()]; + cbf.flip(); cbf.get(cc); + return cc; + + } + + static CoderResult decodeCR(byte[] bb, String csn, boolean testDirect) + throws Exception { + CharsetDecoder dec = Charset.forName(csn).newDecoder(); + ByteBuffer bbf; + CharBuffer cbf; + if (testDirect) { + bbf = ByteBuffer.allocateDirect(bb.length); + cbf = ByteBuffer.allocateDirect(bb.length*2).asCharBuffer(); + bbf.put(bb).flip(); + } else { + bbf = ByteBuffer.wrap(bb); + cbf = CharBuffer.allocate(bb.length); + } + return dec.decode(bbf, cbf, true); + } + + static byte[] encode(char[] cc, String csn, boolean testDirect) + throws Exception { + ByteBuffer bbf; + CharBuffer cbf; + CharsetEncoder enc = Charset.forName(csn).newEncoder(); + if (testDirect) { + bbf = ByteBuffer.allocateDirect(cc.length * 4); + cbf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer(); + cbf.put(cc).flip(); + } else { + bbf = ByteBuffer.allocate(cc.length * 4); + cbf = CharBuffer.wrap(cc); + } + + CoderResult cr = enc.encode(cbf, bbf, true); + if (cr != CoderResult.UNDERFLOW) + throw new RuntimeException("Encoding err: " + csn); + byte[] bb = new byte[bbf.position()]; + bbf.flip(); bbf.get(bb); + return bb; + } + + static CoderResult encodeCR(char[] cc, String csn, boolean testDirect) + throws Exception { + ByteBuffer bbf; + CharBuffer cbf; + CharsetEncoder enc = Charset.forName(csn).newEncoder(); + if (testDirect) { + bbf = ByteBuffer.allocateDirect(cc.length * 4); + cbf = ByteBuffer.allocateDirect(cc.length * 2).asCharBuffer(); + cbf.put(cc).flip(); + } else { + bbf = ByteBuffer.allocate(cc.length * 4); + cbf = CharBuffer.wrap(cc); + } + return enc.encode(cbf, bbf, true); + } + + static char[] getUTFChars() { + char[] cc = new char[0x10000 - 0xe000 + 0xd800 + //bmp + (0x110000 - 0x10000) * 2]; //supp + int pos = 0; + int i = 0; + for (i = 0; i < 0xd800; i++) + cc[pos++] = (char)i; + for (i = 0xe000; i < 0x10000; i++) + cc[pos++] = (char)i; + for (i = 0x10000; i < 0x110000; i++) { + pos += Character.toChars(i, cc, pos); + } + return cc; + } + + static int to3ByteUTF8(char c, byte[] bb, int pos) { + bb[pos++] = (byte)(0xe0 | ((c >> 12))); + bb[pos++] = (byte)(0x80 | ((c >> 06) & 0x3f)); + bb[pos++] = (byte)(0x80 | ((c >> 00) & 0x3f)); + return 3; + } + + static void checkRoundtrip(String csn) throws Exception { + System.out.printf(" Check roundtrip <%s>...", csn); + char[] cc = getUTFChars(); + byte[] bb = encode(cc, csn, false); + char[] ccO = decode(bb, csn, false); + + if (!Arrays.equals(cc, ccO)) { + System.out.printf(" non-direct failed"); + } + bb = encode(cc, csn, true); + ccO = decode(bb, csn, true); + if (!Arrays.equals(cc, ccO)) { + System.out.printf(" (direct) failed"); + } + System.out.println(); + } + + static void check6ByteSurrs(String csn) throws Exception { + System.out.printf(" Check 6-byte Surrogates <%s>...%n", csn); + byte[] bb = new byte[(0x110000 - 0x10000) * 6]; + char[] cc = new char[(0x110000 - 0x10000) * 2]; + int bpos = 0; + int cpos = 0; + for (int i = 0x10000; i < 0x110000; i++) { + Character.toChars(i, cc, cpos); + bpos += to3ByteUTF8(cc[cpos], bb, bpos); + bpos += to3ByteUTF8(cc[cpos + 1], bb, bpos); + cpos += 2; + } + + char[] ccO = decode(bb, csn, false); + if (!Arrays.equals(cc, ccO)) { + System.out.printf(" decoding failed%n"); + } + ccO = decode(bb, csn, true); + if (!Arrays.equals(cc, ccO)) { + System.out.printf(" decoding(direct) failed%n"); + } + } + + static void compare(String csn1, String csn2) throws Exception { + System.out.printf(" Diff <%s> <%s>...%n", csn1, csn2); + char[] cc = getUTFChars(); + + byte[] bb1 = encode(cc, csn1, false); + byte[] bb2 = encode(cc, csn2, false); + if (!Arrays.equals(bb1, bb2)) + System.out.printf(" encoding failed%n"); + char[] cc1 = decode(bb1, csn1, false); + char[] cc2 = decode(bb1, csn2, false); + if (!Arrays.equals(cc1, cc2)) { + System.out.printf(" decoding failed%n"); + } + + bb1 = encode(cc, csn1, true); + bb2 = encode(cc, csn2, true); + if (!Arrays.equals(bb1, bb2)) + System.out.printf(" encoding (direct) failed%n"); + cc1 = decode(bb1, csn1, true); + cc2 = decode(bb1, csn2, true); + if (!Arrays.equals(cc1, cc2)) { + System.out.printf(" decoding (direct) failed%n"); + } + } + + // The first byte is the length of malformed bytes + static byte[][] malformed = { + // One-byte sequences: + {1, (byte)0xFF }, + {1, (byte)0xC0 }, + {1, (byte)0x80 }, + + {1, (byte)0xFF, (byte)0xFF}, // all ones + {1, (byte)0xA0, (byte)0x80}, // 101x first byte first nibble + + // Two-byte sequences: + {1, (byte)0xC0, (byte)0x80}, // invalid first byte + {1, (byte)0xC1, (byte)0xBF}, // invalid first byte + {1, (byte)0xC2, (byte)0x00}, // invalid second byte + {1, (byte)0xC2, (byte)0xC0}, // invalid second byte + {1, (byte)0xD0, (byte)0x00}, // invalid second byte + {1, (byte)0xD0, (byte)0xC0}, // invalid second byte + {1, (byte)0xDF, (byte)0x00}, // invalid second byte + {1, (byte)0xDF, (byte)0xC0}, // invalid second byte + + // Three-byte sequences + {1, (byte)0xE0, (byte)0x80, (byte)0x80}, // 111x first byte first nibble + {1, (byte)0xE0, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded + {1, (byte)0xE0, (byte)0x81, (byte)0xBF }, // U+007F zero-padded + {1, (byte)0xE0, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded + + {1, (byte)0xE0, (byte)0xC0, (byte)0xBF }, // invalid second byte + {2, (byte)0xE0, (byte)0xA0, (byte)0x7F }, // invalid third byte + {2, (byte)0xE0, (byte)0xA0, (byte)0xC0 }, // invalid third byte + {1, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones + {1, (byte)0xE0, (byte)0xC0, (byte)0x80 }, // invalid second byte + {1, (byte)0xE0, (byte)0x80, (byte)0xC0 }, // invalid first byte + + // Four-byte sequences + {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded + {1, (byte)0xF0, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded + {1, (byte)0xF0, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+007F zero-padded + {1, (byte)0xF0, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+07FF zero-padded + + {1, (byte)0xFF, (byte)0xFF, (byte)0xFF, (byte)0xFF }, // all ones + {1, (byte)0xF0, (byte)0x80, (byte)0x80, (byte)0x80}, // invalid second byte + {1, (byte)0xF0, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte + {2, (byte)0xF0, (byte)0x90, (byte)0xC0, (byte)0x80 }, // invalid third byte + {3, (byte)0xF0, (byte)0x90, (byte)0x80, (byte)0xC0 }, // invalid third byte + + {1, (byte)0xF1, (byte)0xC0, (byte)0x80, (byte)0x80 }, // invalid second byte + {2, (byte)0xF1, (byte)0x80, (byte)0xC0, (byte)0x80 }, // invalid third byte + {3, (byte)0xF1, (byte)0x80, (byte)0x80, (byte)0xC0 }, // invalid forth byte + {1, (byte)0xF4, (byte)0x90, (byte)0x80, (byte)0xC0 }, // out-range 4-byte + {1, (byte)0xF4, (byte)0xC0, (byte)0x80, (byte)0xC0 }, // out-range 4-byte + {1, (byte)0xF5, (byte)0x80, (byte)0x80, (byte)0xC0 }, // out-range 4-byte + + // Five-byte sequences + {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80}, // invalid first byte + {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded + {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded + {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded + {5, (byte)0xF8, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded + + {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80}, + {2, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80 }, + {3, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF }, + {4, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0 }, + + // Six-byte sequences + {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, // U+0000 zero-padded + {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x81, (byte)0xBF }, // U+007F zero-padded + {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xBF }, // U+07FF zero-padded + {6, (byte)0xFC, (byte)0x80, (byte)0x80, (byte)0x8F, (byte)0xBF, (byte)0xBF }, // U+FFFF zero-padded + {1, (byte)0xF8, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80, (byte)0x80 }, + {2, (byte)0xF8, (byte)0x80, (byte)0xC0, (byte)0x80, (byte)0x80, (byte)0x80 }, + {3, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0xC1, (byte)0xBF, (byte)0x80 }, + {4, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0xC0, (byte)0x80 }, + {5, (byte)0xF8, (byte)0x80, (byte)0x80, (byte)0x9F, (byte)0x80, (byte)0xC0 }, + }; + + static void checkMalformed(String csn) throws Exception { + boolean failed = false; + System.out.printf(" Check malformed <%s>...%n", csn); + for (boolean direct: new boolean[] {false, true}) { + for (byte[] bins : malformed) { + int mlen = bins[0]; + byte[] bin = Arrays.copyOfRange(bins, 1, bins.length); + CoderResult cr = decodeCR(bin, csn, direct); + String ashex = ""; + for (int i = 0; i < bin.length; i++) { + if (i > 0) ashex += " "; + ashex += Integer.toBinaryString((int)bin[i] & 0xff); + } + if (!cr.isMalformed()) { + System.out.printf(" FAIL(direct=%b): [%s] not malformed.\n", direct, ashex); + failed = true; + } else if (cr.length() != mlen) { + System.out.printf(" FAIL(direct=%b): [%s] malformed[len=%d].\n", direct, ashex, cr.length()); + failed = true; + } + } + } + if (failed) + throw new RuntimeException("Check malformed failed " + csn); + } + + static boolean check(CharsetDecoder dec, byte[] utf8s, boolean direct, int[] flow) { + int inPos = flow[0]; + int inLen = flow[1]; + int outPos = flow[2]; + int outLen = flow[3]; + int expedInPos = flow[4]; + int expedOutPos = flow[5]; + CoderResult expedCR = (flow[6]==0)?CoderResult.UNDERFLOW + :CoderResult.OVERFLOW; + ByteBuffer bbf; + CharBuffer cbf; + if (direct) { + bbf = ByteBuffer.allocateDirect(inPos + utf8s.length); + cbf = ByteBuffer.allocateDirect((outPos + outLen)*2).asCharBuffer(); + } else { + bbf = ByteBuffer.allocate(inPos + utf8s.length); + cbf = CharBuffer.allocate(outPos + outLen); + } + bbf.position(inPos); + bbf.put(utf8s).flip().position(inPos).limit(inPos + inLen); + cbf.position(outPos); + dec.reset(); + CoderResult cr = dec.decode(bbf, cbf, false); + if (cr != expedCR || + bbf.position() != expedInPos || + cbf.position() != expedOutPos) { + System.out.printf("Expected(direct=%5b): [", direct); + for (int i:flow) System.out.print(" " + i); + System.out.println("] CR=" + cr + + ", inPos=" + bbf.position() + + ", outPos=" + cbf.position()); + return false; + } + return true; + } + + static void checkUnderOverflow(String csn) throws Exception { + System.out.printf(" Check under/overflow <%s>...%n", csn); + CharsetDecoder dec = Charset.forName(csn).newDecoder(); + boolean failed = false; + byte[] utf8s = new String("\u007f\u07ff\ue000\ud800\udc00").getBytes("UTF-8"); + int inlen = utf8s.length; + + for (int inoff = 0; inoff < 20; inoff++) { + for (int outoff = 0; outoff < 20; outoff++) { + int[][] Flows = { + //inpos, inLen, outPos, outLen, inPosEP, outposEP, under(0)/over(1) + {inoff, inlen, outoff, 1, inoff + 1, outoff + 1, 1}, + {inoff, inlen, outoff, 2, inoff + 3, outoff + 2, 1}, + {inoff, inlen, outoff, 3, inoff + 6, outoff + 3, 1}, + {inoff, inlen, outoff, 4, inoff + 6, outoff + 3, 1}, + {inoff, inlen, outoff, 5, inoff + 10,outoff + 5, 0}, + // underflow + {inoff, 1, outoff, 5, inoff + 1, outoff + 1, 0}, + {inoff, 2, outoff, 5, inoff + 1, outoff + 1, 0}, + {inoff, 3, outoff, 5, inoff + 3, outoff + 2, 0}, + {inoff, 4, outoff, 5, inoff + 3, outoff + 2, 0}, + {inoff, 5, outoff, 5, inoff + 3, outoff + 2, 0}, + {inoff, 6, outoff, 5, inoff + 6, outoff + 3, 0}, + {inoff, 7, outoff, 5, inoff + 6, outoff + 3, 0}, + {inoff, 8, outoff, 5, inoff + 6, outoff + 3, 0}, + {inoff, 9, outoff, 5, inoff + 6, outoff + 3, 0}, + {inoff, 10, outoff, 5, inoff + 10,outoff + 5, 0}, + // 2-byte underflow/overflow + {inoff, 2, outoff, 1, inoff + 1, outoff + 1, 0}, + {inoff, 3, outoff, 1, inoff + 1, outoff + 1, 1}, + // 3-byte underflow/overflow + {inoff, 4, outoff, 2, inoff + 3, outoff + 2, 0}, + {inoff, 5, outoff, 2, inoff + 3, outoff + 2, 0}, + {inoff, 6, outoff, 2, inoff + 3, outoff + 2, 1}, + // 4-byte underflow/overflow + {inoff, 7, outoff, 4, inoff + 6, outoff + 3, 0}, + {inoff, 8, outoff, 4, inoff + 6, outoff + 3, 0}, + {inoff, 9, outoff, 4, inoff + 6, outoff + 3, 0}, + {inoff, 10, outoff, 4, inoff + 6, outoff + 3, 1}, + }; + for (boolean direct: new boolean[] {false, true}) { + for (int[] flow: Flows) { + if (!check(dec, utf8s, direct, flow)) + failed = true; + } + }}} + if (failed) + throw new RuntimeException("Check under/overflow failed " + csn); + } + + public static void main(String[] args) throws Exception { + checkRoundtrip("UTF-8"); + check6ByteSurrs("UTF-8"); + //compare("UTF-8", "UTF-8-OLD"); + checkMalformed("UTF-8"); + checkUnderOverflow("UTF-8"); + } +} |