/** ******************************************************************************* * Copyright (C) 2002-2004, International Business Machines Corporation and * * others. All Rights Reserved. * ******************************************************************************* */ package com.ibm.icu.dev.test; /** * Utility class for supplementary code point * support. This one is written purely for updating * Normalization sample from the unicode.org site. * If you want the real thing, use UTF16 class * from ICU4J * @author Vladimir Weinstein, Markus Scherer */ public class UTF16Util { static final int suppOffset = (0xd800 << 10) + 0xdc00 - 0x10000; /** * Method nextCodePoint. Returns the next code point * in a string. * @param s String in question * @param i index from which we want a code point * @return int codepoint at index i */ public static final int nextCodePoint(String s, int i) { int ch = s.charAt(i); if (0xd800 <= ch && ch <= 0xdbff && ++i < s.length()) { int ch2 = s.charAt(i); if (0xdc00 <= ch2 && ch2 <= 0xdfff) { ch = (ch << 10) + ch2 - suppOffset; } } return ch; } /** * Method prevCodePoint. Gets the code point preceding * index i (predecrement). * @param s String in question * @param i index in string * @return int codepoint at index --i */ public static final int prevCodePoint(String s, int i) { int ch = s.charAt(--i); if (0xdc00 <= ch && ch <= 0xdfff && --i >= 0) { int ch2 = s.charAt(i); if (0xd800 <= ch2 && ch2 <= 0xdbff) { ch = (ch2 << 10) + ch - suppOffset; } } return ch; } /** * Method nextCodePoint. Returns the next code point * in a string. * @param s StringBuffer in question * @param i index from which we want a code point * @return int codepoint at index i */ public static final int nextCodePoint(StringBuffer s, int i) { int ch = s.charAt(i); if (0xd800 <= ch && ch <= 0xdbff && ++i < s.length()) { int ch2 = s.charAt(i); if (0xdc00 <= ch2 && ch2 <= 0xdfff) { ch = (ch << 10) + ch2 - suppOffset; } } return ch; } /** * Method prevCodePoint. Gets the code point preceding * index i (predecrement). * @param s StringBuffer in question * @param i index in string * @return int codepoint at index --i */ public static final int prevCodePoint(StringBuffer s, int i) { int ch = s.charAt(--i); if (0xdc00 <= ch && ch <= 0xdfff && --i >= 0) { int ch2 = s.charAt(i); if (0xd800 <= ch2 && ch2 <= 0xdbff) { ch = (ch2 << 10) + ch - suppOffset; } } return ch; } /** * Method codePointLength. Returns the length * in UTF-16 code units of a given code point * @param c code point in question * @return int length in UTF-16 code units. Can be 1 or 2 */ public static final int codePointLength(int c) { return c <= 0xffff ? 1 : 2; } /** * Method appendCodePoint. Appends a code point * to a StringBuffer * @param buffer StringBuffer in question * @param ch code point to append */ public static final void appendCodePoint(StringBuffer buffer, int ch) { if (ch <= 0xffff) { buffer.append((char)ch); } else { buffer.append((char)(0xd7c0 + (ch >> 10))); buffer.append((char)(0xdc00 + (ch & 0x3ff))); } } /** * Method insertCodePoint. Inserts a code point in * a StringBuffer * @param buffer StringBuffer in question * @param i index at which we want code point to be inserted * @param ch code point to be inserted */ public static final void insertCodePoint(StringBuffer buffer, int i, int ch) { if (ch <= 0xffff) { buffer.insert(i, (char)ch); } else { buffer.insert(i, (char)(0xd7c0 + (ch >> 10))).insert(i + 1, (char)(0xdc00 + (ch & 0x3ff))); } } /** * Method setCodePointAt. Changes a code point at a * given index. Can change the length of the string. * @param buffer StringBuffer in question * @param i index at which we want to change the contents * @param ch replacement code point * @return int difference in resulting StringBuffer length */ public static final int setCodePointAt(StringBuffer buffer, int i, int ch) { int cp = nextCodePoint(buffer, i); if (ch <= 0xffff && cp <= 0xffff) { // Both BMP buffer.setCharAt(i, (char)ch); return 0; } else if (ch > 0xffff && cp > 0xffff) { // Both supplementary buffer.setCharAt(i, (char)(0xd7c0 + (ch >> 10))); buffer.setCharAt(i+1, (char)(0xdc00 + (ch & 0x3ff))); return 0; } else if (ch <= 0xffff && cp > 0xffff) { // putting BMP instead of supplementary, buffer shrinks buffer.setCharAt(i, (char)ch); buffer.deleteCharAt(i+1); return -1; } else { //if (ch > 0xffff && cp <= 0xffff) { // putting supplementary instead of BMP, buffer grows buffer.setCharAt(i, (char)(0xd7c0 + (ch >> 10))); buffer.insert(i+1, (char)(0xdc00 + (ch & 0x3ff))); return 1; } } /** * Method countCodePoint. Counts the UTF-32 code points * in a UTF-16 encoded string. * @param source String in question. * @return int number of code points in this string */ public static final int countCodePoint(String source) { int result = 0; char ch; boolean hadLeadSurrogate = false; for (int i = 0; i < source.length(); ++ i) { ch = source.charAt(i); if (hadLeadSurrogate && 0xdc00 <= ch && ch <= 0xdfff) { hadLeadSurrogate = false; // count valid trail as zero } else { hadLeadSurrogate = (0xd800 <= ch && ch <= 0xdbff); ++ result; // count others as 1 } } return result; } /** * Method countCodePoint. Counts the UTF-32 code points * in a UTF-16 encoded string. * @param source StringBuffer in question. * @return int number of code points in this string */ public static final int countCodePoint(StringBuffer source) { int result = 0; char ch; boolean hadLeadSurrogate = false; for (int i = 0; i < source.length(); ++ i) { ch = source.charAt(i); if (hadLeadSurrogate && 0xdc00 <= ch && ch <= 0xdfff) { hadLeadSurrogate = false; // count valid trail as zero } else { hadLeadSurrogate = (0xd800 <= ch && ch <= 0xdbff); ++ result; // count others as 1 } } return result; } /** * The minimum value for Supplementary code points */ public static final int SUPPLEMENTARY_MIN_VALUE = 0x10000; /** * Determines how many chars this char32 requires. * If a validity check is required, use * isLegal() on * char32 before calling. * @param char32 the input codepoint. * @return 2 if is in supplementary space, otherwise 1. */ public static int getCharCount(int char32) { if (char32 < SUPPLEMENTARY_MIN_VALUE) { return 1; } return 2; } /** * Lead surrogate maximum value * @stable ICU 2.1 */ public static final int LEAD_SURROGATE_MAX_VALUE = 0xDBFF; /** * Lead surrogate minimum value * @stable ICU 2.1 */ public static final int LEAD_SURROGATE_MIN_VALUE = 0xD800; /** * Trail surrogate minimum value * @stable ICU 2.1 */ public static final int TRAIL_SURROGATE_MIN_VALUE = 0xDC00; /** * Trail surrogate maximum value * @stable ICU 2.1 */ public static final int TRAIL_SURROGATE_MAX_VALUE = 0xDFFF; /** * Determines whether the code value is a surrogate. * @param char16 the input character. * @return true iff the input character is a surrogate. * @stable ICU 2.1 */ public static boolean isSurrogate(char char16) { return LEAD_SURROGATE_MIN_VALUE <= char16 && char16 <= TRAIL_SURROGATE_MAX_VALUE; } /** * Determines whether the character is a trail surrogate. * @param char16 the input character. * @return true iff the input character is a trail surrogate. * @stable ICU 2.1 */ public static boolean isTrailSurrogate(char char16) { return (TRAIL_SURROGATE_MIN_VALUE <= char16 && char16 <= TRAIL_SURROGATE_MAX_VALUE); } /** * Determines whether the character is a lead surrogate. * @param char16 the input character. * @return true iff the input character is a lead surrogate * @stable ICU 2.1 */ public static boolean isLeadSurrogate(char char16) { return LEAD_SURROGATE_MIN_VALUE <= char16 && char16 <= LEAD_SURROGATE_MAX_VALUE; } /** * Extract a single UTF-32 value from a substring. * Used when iterating forwards or backwards (with * UTF16.getCharCount(), as well as random access. If a * validity check is required, use * UCharacter.isLegal() * on the return value. * If the char retrieved is part of a surrogate pair, its supplementary * character will be returned. If a complete supplementary character is * not found the incomplete character will be returned * @param source array of UTF-16 chars * @param start offset to substring in the source array for analyzing * @param limit offset to substring in the source array for analyzing * @param offset16 UTF-16 offset relative to start * @return UTF-32 value for the UTF-32 value that contains the char at * offset16. The boundaries of that codepoint are the same as in * bounds32(). * @exception IndexOutOfBoundsException thrown if offset16 is not within * the range of start and limit. * @stable ICU 2.1 */ public static int charAt(char source[], int start, int limit, int offset16) { offset16 += start; if (offset16 < start || offset16 >= limit) { throw new ArrayIndexOutOfBoundsException(offset16); } char single = source[offset16]; if (!isSurrogate(single)) { return single; } // Convert the UTF-16 surrogate pair if necessary. // For simplicity in usage, and because the frequency of pairs is // low, look both directions. if (single <= LEAD_SURROGATE_MAX_VALUE) { offset16 ++; if (offset16 >= limit) { return single; } char trail = source[offset16]; if (isTrailSurrogate(trail)) { return getRawSupplementary(single, trail); } } else { // isTrailSurrogate(single), so if (offset16 == start) { return single; } offset16 --; char lead = source[offset16]; if (isLeadSurrogate(lead)) return getRawSupplementary(lead, single); } return single; // return unmatched surrogate } /** * Shift value for lead surrogate to form a supplementary character. */ private static final int LEAD_SURROGATE_SHIFT_ = 10; /** * Offset to add to combined surrogate pair to avoid msking. */ private static final int SURROGATE_OFFSET_ = SUPPLEMENTARY_MIN_VALUE - (LEAD_SURROGATE_MIN_VALUE << LEAD_SURROGATE_SHIFT_) - TRAIL_SURROGATE_MIN_VALUE; /** * Forms a supplementary code point from the argument character
* Note this is for internal use hence no checks for the validity of the * surrogate characters are done * @param lead lead surrogate character * @param trail trailing surrogate character * @return code point of the supplementary character */ public static int getRawSupplementary(char lead, char trail) { return (lead << LEAD_SURROGATE_SHIFT_) + trail + SURROGATE_OFFSET_; } }