/* ****************************************************************************** * * Copyright (C) 2002-2003, International Business Machines * Corporation and others. All Rights Reserved. * ****************************************************************************** * file name: bocu1.h * encoding: US-ASCII * tab size: 8 (not used) * indentation:4 * * created on: 2002jan24 * created by: Markus W. Scherer * * This is the definition file for the sample implementation of BOCU-1, * a MIME-compatible Binary Ordered Compression for Unicode. */ #ifndef __BOCU1_H__ #define __BOCU1_H__ /* * Standard ICU header. * - Includes inttypes.h or defines its types. * - Defines UChar for UTF-16 as an unsigned 16-bit type (wchar_t or uint16_t). * - Defines UTF* macros to handle reading and writing * of in-process UTF-8/16 strings. */ #include "unicode/utypes.h" /* BOCU-1 constants and macros ---------------------------------------------- */ /* * BOCU-1 encodes the code points of a Unicode string as * a sequence of byte-encoded differences (slope detection), * preserving lexical order. * * Optimize the difference-taking for runs of Unicode text within * small scripts: * * Most small scripts are allocated within aligned 128-blocks of Unicode * code points. Lexical order is preserved if the "previous code point" state * is always moved into the middle of such a block. * * Additionally, "prev" is moved from anywhere in the Unihan and Hangul * areas into the middle of those areas. * * C0 control codes and space are encoded with their US-ASCII bytes. * "prev" is reset for C0 controls but not for space. */ /* initial value for "prev": middle of the ASCII range */ #define BOCU1_ASCII_PREV 0x40 /* bounding byte values for differences */ #define BOCU1_MIN 0x21 #define BOCU1_MIDDLE 0x90 #define BOCU1_MAX_LEAD 0xfe /* add the L suffix to make computations with BOCU1_MAX_TRAIL work on 16-bit compilers */ #define BOCU1_MAX_TRAIL 0xffL #define BOCU1_RESET 0xff /* number of lead bytes */ #define BOCU1_COUNT (BOCU1_MAX_LEAD-BOCU1_MIN+1) /* adjust trail byte counts for the use of some C0 control byte values */ #define BOCU1_TRAIL_CONTROLS_COUNT 20 #define BOCU1_TRAIL_BYTE_OFFSET (BOCU1_MIN-BOCU1_TRAIL_CONTROLS_COUNT) /* number of trail bytes */ #define BOCU1_TRAIL_COUNT ((BOCU1_MAX_TRAIL-BOCU1_MIN+1)+BOCU1_TRAIL_CONTROLS_COUNT) /* * number of positive and negative single-byte codes * (counting 0==BOCU1_MIDDLE among the positive ones) */ #define BOCU1_SINGLE 64 /* number of lead bytes for positive and negative 2/3/4-byte sequences */ #define BOCU1_LEAD_2 43 #define BOCU1_LEAD_3 3 #define BOCU1_LEAD_4 1 /* The difference value range for single-byters. */ #define BOCU1_REACH_POS_1 (BOCU1_SINGLE-1) #define BOCU1_REACH_NEG_1 (-BOCU1_SINGLE) /* The difference value range for double-byters. */ #define BOCU1_REACH_POS_2 (BOCU1_REACH_POS_1+BOCU1_LEAD_2*BOCU1_TRAIL_COUNT) #define BOCU1_REACH_NEG_2 (BOCU1_REACH_NEG_1-BOCU1_LEAD_2*BOCU1_TRAIL_COUNT) /* The difference value range for 3-byters. */ #define BOCU1_REACH_POS_3 \ (BOCU1_REACH_POS_2+BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) #define BOCU1_REACH_NEG_3 (BOCU1_REACH_NEG_2-BOCU1_LEAD_3*BOCU1_TRAIL_COUNT*BOCU1_TRAIL_COUNT) /* The lead byte start values. */ #define BOCU1_START_POS_2 (BOCU1_MIDDLE+BOCU1_REACH_POS_1+1) #define BOCU1_START_POS_3 (BOCU1_START_POS_2+BOCU1_LEAD_2) #define BOCU1_START_POS_4 (BOCU1_START_POS_3+BOCU1_LEAD_3) /* ==BOCU1_MAX_LEAD */ #define BOCU1_START_NEG_2 (BOCU1_MIDDLE+BOCU1_REACH_NEG_1) #define BOCU1_START_NEG_3 (BOCU1_START_NEG_2-BOCU1_LEAD_2) #define BOCU1_START_NEG_4 (BOCU1_START_NEG_3-BOCU1_LEAD_3) /* ==BOCU1_MIN+1 */ /* The length of a byte sequence, according to the lead byte (!=BOCU1_RESET). */ #define BOCU1_LENGTH_FROM_LEAD(lead) \ ((BOCU1_START_NEG_2<=(lead) && (lead)>24 : 4) /* * 12 commonly used C0 control codes (and space) are only used to encode * themselves directly, * which makes BOCU-1 MIME-usable and reasonably safe for * ASCII-oriented software. * * These controls are * 0 NUL * * 7 BEL * 8 BS * * 9 TAB * a LF * b VT * c FF * d CR * * e SO * f SI * * 1a SUB * 1b ESC * * The other 20 C0 controls are also encoded directly (to preserve order) * but are also used as trail bytes in difference encoding * (for better compression). */ #define BOCU1_TRAIL_TO_BYTE(t) ((t)>=BOCU1_TRAIL_CONTROLS_COUNT ? (t)+BOCU1_TRAIL_BYTE_OFFSET : bocu1TrailToByte[t]) /* * Byte value map for control codes, * from external byte values 0x00..0x20 * to trail byte values 0..19 (0..0x13) as used in the difference calculation. * External byte values that are illegal as trail bytes are mapped to -1. */ static int8_t bocu1ByteToTrail[BOCU1_MIN]={ /* 0 1 2 3 4 5 6 7 */ -1, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, -1, /* 8 9 a b c d e f */ -1, -1, -1, -1, -1, -1, -1, -1, /* 10 11 12 13 14 15 16 17 */ 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, /* 18 19 1a 1b 1c 1d 1e 1f */ 0x0e, 0x0f, -1, -1, 0x10, 0x11, 0x12, 0x13, /* 20 */ -1 }; /* * Byte value map for control codes, * from trail byte values 0..19 (0..0x13) as used in the difference calculation * to external byte values 0x00..0x20. */ static int8_t bocu1TrailToByte[BOCU1_TRAIL_CONTROLS_COUNT]={ /* 0 1 2 3 4 5 6 7 */ 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x10, 0x11, /* 8 9 a b c d e f */ 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, /* 10 11 12 13 */ 0x1c, 0x1d, 0x1e, 0x1f }; /** * Integer division and modulo with negative numerators * yields negative modulo results and quotients that are one more than * what we need here. * This macro adjust the results so that the modulo-value m is always >=0. * * For positive n, the if() condition is always FALSE. * * @param n Number to be split into quotient and rest. * Will be modified to contain the quotient. * @param d Divisor. * @param m Output variable for the rest (modulo result). */ #define NEGDIVMOD(n, d, m) { \ (m)=(n)%(d); \ (n)/=(d); \ if((m)<0) { \ --(n); \ (m)+=(d); \ } \ } /* State for BOCU-1 decoder function. */ struct Bocu1Rx { int32_t prev, count, diff; }; typedef struct Bocu1Rx Bocu1Rx; /* Function prototypes ------------------------------------------------------ */ /* see bocu1.c */ U_CFUNC int32_t packDiff(int32_t diff); U_CFUNC int32_t encodeBocu1(int32_t *pPrev, int32_t c); U_CFUNC int32_t decodeBocu1(Bocu1Rx *pRx, uint8_t b); #endif