X-Git-Url: http://git.asbjorn.it/?a=blobdiff_plain;f=lib%2Fgocr%2Funicode.c;fp=lib%2Fgocr%2Funicode.c;h=d8ed703676e5e7c86bcf4cc25f8d3c9d73152b61;hb=8154e11e1c06aefe18c16b33f2b12d6de21273a4;hp=0000000000000000000000000000000000000000;hpb=e8fe2f290123fc66181709a8a5263ad9e91c6939;p=swftools.git diff --git a/lib/gocr/unicode.c b/lib/gocr/unicode.c new file mode 100644 index 0000000..d8ed703 --- /dev/null +++ b/lib/gocr/unicode.c @@ -0,0 +1,1314 @@ +/* +This is a Optical-Character-Recognition program +Copyright (C) 2000-2007 Joerg Schulenburg + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + + see README for EMAIL-address + */ + +#include "unicode.h" +#include <stdio.h> + +/* FIXME jb global */ +int warn=0; /* if 1 a message is generated if composition is not defined */ + +/* Arguments: the character (main), and the modifier (accent, etc). See the + function if you want to know the modifiers. + Description: This function intends to be a small helper, to avoid having + to write switches in functions. It's therefore mainly for accents, and + especially for the most usual ones. It supports the basic greek + characters too, which is actually not very helpful. + Returns: the unicode character corresponding to the composed character. + + ToDo: + - It seems to me, that tables should be more effective. + So we should use tables in future? 
(js) + */ +wchar_t compose(wchar_t main, wchar_t modifier) { +/* supported by now: part of ISO8859-1, basic greek characters */ + if( main == UNKNOWN || main == PICTURE ) return main; +#ifdef DEBUG + if(modifier!=UNICODE_NULL && modifier!=SPACE) + printf(" compose(%c,%d)",(char)main,(int)modifier); +#endif + if(main>127 && modifier!=0 && modifier!=SPACE && warn) + fprintf(stderr,"# Warning compose %04x + %04x>127\n", + (int)modifier,(int)main); + switch (modifier) { + case UNICODE_NULL: + case SPACE: + return (wchar_t)main; + + case APOSTROPHE: /* do NOT USE this. It's here for compatibility only. + Use ACUTE_ACCENT instead. */ + fprintf( stderr, "COMPOSE: got APOSTROPHE instead of ACUTE_ACCENT"); + + case ACUTE_ACCENT: /* acute/cedilla */ + switch (main) { + case 'a': return LATIN_SMALL_LETTER_A_WITH_ACUTE; + case 'A': return LATIN_CAPITAL_LETTER_A_WITH_ACUTE; + case LATIN_SMALL_LETTER_AE: return LATIN_SMALL_LETTER_AE_WITH_ACUTE; + case LATIN_CAPITAL_LETTER_AE: return LATIN_CAPITAL_LETTER_AE_WITH_ACUTE; + case 'c': return LATIN_SMALL_LETTER_C_WITH_ACUTE; + case 'C': return LATIN_CAPITAL_LETTER_C_WITH_ACUTE; + case 'e': return LATIN_SMALL_LETTER_E_WITH_ACUTE; + case 'E': return LATIN_CAPITAL_LETTER_E_WITH_ACUTE; + case 'g': return LATIN_SMALL_LETTER_G_WITH_ACUTE; + case 'G': return LATIN_CAPITAL_LETTER_G_WITH_ACUTE; + case 'i': return LATIN_SMALL_LETTER_I_WITH_ACUTE; + case 'I': return LATIN_CAPITAL_LETTER_I_WITH_ACUTE; + case 'l': return LATIN_SMALL_LETTER_L_WITH_ACUTE; + case 'L': return LATIN_CAPITAL_LETTER_L_WITH_ACUTE; + case 'n': return LATIN_SMALL_LETTER_N_WITH_ACUTE; + case 'N': return LATIN_CAPITAL_LETTER_N_WITH_ACUTE; + case 'o': return LATIN_SMALL_LETTER_O_WITH_ACUTE; + case 'O': return LATIN_CAPITAL_LETTER_O_WITH_ACUTE; + case '0': return LATIN_CAPITAL_LETTER_O_WITH_ACUTE; + case 'r': return LATIN_SMALL_LETTER_R_WITH_ACUTE; + case 'R': return LATIN_CAPITAL_LETTER_R_WITH_ACUTE; + case 's': return LATIN_SMALL_LETTER_S_WITH_ACUTE; + case 'S': return 
LATIN_CAPITAL_LETTER_S_WITH_ACUTE; + case 'u': return LATIN_SMALL_LETTER_U_WITH_ACUTE; + case 'U': return LATIN_CAPITAL_LETTER_U_WITH_ACUTE; + case 'y': return LATIN_SMALL_LETTER_Y_WITH_ACUTE; + case 'Y': return LATIN_CAPITAL_LETTER_Y_WITH_ACUTE; + case 'z': return LATIN_SMALL_LETTER_Z_WITH_ACUTE; + case 'Z': return LATIN_CAPITAL_LETTER_Z_WITH_ACUTE; + default: + if(warn)fprintf( stderr, " COMPOSE: ACUTE_ACCENT+%04x not defined\n",(int)main); + } + break; + + case BREVE: /* caron (latin2) "u"-above-... (small bow) */ + switch (main) { + /* FIXME write separate heuristics for breve */ + case 'a': return LATIN_SMALL_LETTER_A_WITH_BREVE; + case 'A': return LATIN_CAPITAL_LETTER_A_WITH_BREVE; + case 'e': return LATIN_SMALL_LETTER_E_WITH_BREVE; + case 'E': return LATIN_CAPITAL_LETTER_E_WITH_BREVE; + case 'g': return LATIN_SMALL_LETTER_G_WITH_BREVE; + case 'G': return LATIN_CAPITAL_LETTER_G_WITH_BREVE; + case 'i': return LATIN_SMALL_LETTER_I_WITH_BREVE; + case 'I': return LATIN_CAPITAL_LETTER_I_WITH_BREVE; + case 'o': return LATIN_SMALL_LETTER_O_WITH_BREVE; + case 'O': return LATIN_CAPITAL_LETTER_O_WITH_BREVE; + case 'u': return LATIN_SMALL_LETTER_U_WITH_BREVE; + case 'U': return LATIN_CAPITAL_LETTER_U_WITH_BREVE; + default: + if(warn)fprintf( stderr, " COMPOSE: BREVE+%04x not defined\n",(int)main); + } + break; + + case CARON: /* caron (latin2) "v"-above-... 
*/ + switch (main) { + case 'a': return LATIN_SMALL_LETTER_A_WITH_CARON; + case 'A': return LATIN_CAPITAL_LETTER_A_WITH_CARON; + case 'c': return LATIN_SMALL_LETTER_C_WITH_CARON; + case 'C': return LATIN_CAPITAL_LETTER_C_WITH_CARON; + case 'e': return LATIN_SMALL_LETTER_E_WITH_CARON; + case 'E': return LATIN_CAPITAL_LETTER_E_WITH_CARON; + case 'i': return LATIN_SMALL_LETTER_I_WITH_CARON; + case 'I': return LATIN_CAPITAL_LETTER_I_WITH_CARON; + case 'o': return LATIN_SMALL_LETTER_O_WITH_CARON; + case 'O': return LATIN_CAPITAL_LETTER_O_WITH_CARON; + case '0': return LATIN_CAPITAL_LETTER_O_WITH_CARON; + case 's': return LATIN_SMALL_LETTER_S_WITH_CARON; + case 'S': return LATIN_CAPITAL_LETTER_S_WITH_CARON; + case 'u': return LATIN_SMALL_LETTER_U_WITH_CARON; + case 'U': return LATIN_CAPITAL_LETTER_U_WITH_CARON; + case 'z': return LATIN_SMALL_LETTER_Z_WITH_CARON; + case 'Z': return LATIN_CAPITAL_LETTER_Z_WITH_CARON; + default: + if(warn)fprintf( stderr, " COMPOSE: CARON+%04x not defined\n",(int)main); + } + break; + + case CEDILLA: + switch (main) { + case 'c': return LATIN_SMALL_LETTER_C_WITH_CEDILLA; + case 'C': return LATIN_CAPITAL_LETTER_C_WITH_CEDILLA; + default: + if(warn)fprintf( stderr, " COMPOSE: CEDILLA+%04x not defined\n",(int)main); + } + break; + + case TILDE: + switch (main) { + case 'a': return LATIN_SMALL_LETTER_A_WITH_TILDE; + case 'A': return LATIN_CAPITAL_LETTER_A_WITH_TILDE; + case 'i': return LATIN_SMALL_LETTER_I_WITH_TILDE; + case 'I': return LATIN_CAPITAL_LETTER_I_WITH_TILDE; + case 'n': return LATIN_SMALL_LETTER_N_WITH_TILDE; + case 'N': return LATIN_CAPITAL_LETTER_N_WITH_TILDE; + case 'o': return LATIN_SMALL_LETTER_O_WITH_TILDE; + case 'O': return LATIN_CAPITAL_LETTER_O_WITH_TILDE; + case '0': return LATIN_CAPITAL_LETTER_O_WITH_TILDE; + case 'u': return LATIN_SMALL_LETTER_U_WITH_TILDE; + case 'U': return LATIN_CAPITAL_LETTER_U_WITH_TILDE; + default: + if(warn)fprintf( stderr, " COMPOSE: TILDE+%04x not defined\n",(int)main); + } + break; + + case 
GRAVE_ACCENT: + switch (main) { + case 'a': return LATIN_SMALL_LETTER_A_WITH_GRAVE; + case 'A': return LATIN_CAPITAL_LETTER_A_WITH_GRAVE; + case 'e': return LATIN_SMALL_LETTER_E_WITH_GRAVE; + case 'E': return LATIN_CAPITAL_LETTER_E_WITH_GRAVE; + case 'i': return LATIN_SMALL_LETTER_I_WITH_GRAVE; + case 'I': return LATIN_CAPITAL_LETTER_I_WITH_GRAVE; + case 'n': return LATIN_SMALL_LETTER_N_WITH_GRAVE; + case 'N': return LATIN_CAPITAL_LETTER_N_WITH_GRAVE; + case 'o': return LATIN_SMALL_LETTER_O_WITH_GRAVE; + case 'O': return LATIN_CAPITAL_LETTER_O_WITH_GRAVE; + case '0': return LATIN_CAPITAL_LETTER_O_WITH_GRAVE; + case 'u': return LATIN_SMALL_LETTER_U_WITH_GRAVE; + case 'U': return LATIN_CAPITAL_LETTER_U_WITH_GRAVE; + default: + if(warn)fprintf( stderr, " COMPOSE: GRAVE_ACCENT+%04x not defined\n",(int)main); + } + break; + + case QUOTATION_MARK: /* do NOT USE this. It's here for compatibility only. + Use DIAERESIS instead. */ + fprintf( stderr, "COMPOSE: got APOSTROPHE instead of ACUTE_ACCENT"); + + case DIAERESIS: + switch (main) { + case 'a': return LATIN_SMALL_LETTER_A_WITH_DIAERESIS; + case 'A': return LATIN_CAPITAL_LETTER_A_WITH_DIAERESIS; + case 'e': return LATIN_SMALL_LETTER_E_WITH_DIAERESIS; + case 'E': return LATIN_CAPITAL_LETTER_E_WITH_DIAERESIS; + case 'i': return LATIN_SMALL_LETTER_I_WITH_DIAERESIS; + case 'I': return LATIN_CAPITAL_LETTER_I_WITH_DIAERESIS; + case 'o': return LATIN_SMALL_LETTER_O_WITH_DIAERESIS; + case 'O': return LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS; + case '0': return LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS; + case 'u': return LATIN_SMALL_LETTER_U_WITH_DIAERESIS; + case 'U': return LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS; + case 'y': return LATIN_SMALL_LETTER_Y_WITH_DIAERESIS; + case 'Y': return LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS; + default: + if(warn)fprintf( stderr, " COMPOSE: DIAERESIS+%04x (%c) not defined\n",(int)main,(char)main); + } + break; + + case CIRCUMFLEX_ACCENT: /* ^ */ + switch (main) { + case 'a': return 
LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX; + case 'A': return LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX; + case 'c': return LATIN_SMALL_LETTER_C_WITH_CIRCUMFLEX; + case 'C': return LATIN_CAPITAL_LETTER_C_WITH_CIRCUMFLEX; + case 'e': return LATIN_SMALL_LETTER_E_WITH_CIRCUMFLEX; + case 'E': return LATIN_CAPITAL_LETTER_E_WITH_CIRCUMFLEX; + case 'g': return LATIN_SMALL_LETTER_G_WITH_CIRCUMFLEX; + case 'G': return LATIN_CAPITAL_LETTER_G_WITH_CIRCUMFLEX; + case 'h': return LATIN_SMALL_LETTER_H_WITH_CIRCUMFLEX; + case 'H': return LATIN_CAPITAL_LETTER_H_WITH_CIRCUMFLEX; + case 'i': return LATIN_SMALL_LETTER_I_WITH_CIRCUMFLEX; + case 'I': return LATIN_CAPITAL_LETTER_I_WITH_CIRCUMFLEX; + case 'j': return LATIN_SMALL_LETTER_J_WITH_CIRCUMFLEX; + case 'J': return LATIN_CAPITAL_LETTER_J_WITH_CIRCUMFLEX; + case 'o': return LATIN_SMALL_LETTER_O_WITH_CIRCUMFLEX; + case 'O': return LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX; + case '0': return LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX; + case 's': return LATIN_SMALL_LETTER_S_WITH_CIRCUMFLEX; + case 'S': return LATIN_CAPITAL_LETTER_S_WITH_CIRCUMFLEX; + case 'u': return LATIN_SMALL_LETTER_U_WITH_CIRCUMFLEX; + case 'U': return LATIN_CAPITAL_LETTER_U_WITH_CIRCUMFLEX; + case 'w': return LATIN_SMALL_LETTER_W_WITH_CIRCUMFLEX; + case 'W': return LATIN_CAPITAL_LETTER_W_WITH_CIRCUMFLEX; + case 'y': return LATIN_SMALL_LETTER_Y_WITH_CIRCUMFLEX; + case 'Y': return LATIN_CAPITAL_LETTER_Y_WITH_CIRCUMFLEX; + default: + if(warn)fprintf( stderr, " COMPOSE: CIRCUMFLEX_ACCENT+%04x not defined\n",(int)main); + } + break; + + case MACRON: /* a minus sign above the char (latin2) */ + switch (main) { + case 'a': return LATIN_SMALL_LETTER_A_WITH_MACRON; + case 'A': return LATIN_CAPITAL_LETTER_A_WITH_MACRON; + case 'e': return LATIN_SMALL_LETTER_E_WITH_MACRON; + case 'E': return LATIN_CAPITAL_LETTER_E_WITH_MACRON; + case 'i': return LATIN_SMALL_LETTER_I_WITH_MACRON; + case 'I': return LATIN_CAPITAL_LETTER_I_WITH_MACRON; + case 'o': return LATIN_SMALL_LETTER_O_WITH_MACRON; 
+ case 'O': return LATIN_CAPITAL_LETTER_O_WITH_MACRON; + case 'u': return LATIN_SMALL_LETTER_U_WITH_MACRON; + case 'U': return LATIN_CAPITAL_LETTER_U_WITH_MACRON; + case 'y': return LATIN_SMALL_LETTER_Y_WITH_MACRON; + case 'Y': return LATIN_CAPITAL_LETTER_Y_WITH_MACRON; + case LATIN_SMALL_LETTER_AE: return LATIN_SMALL_LETTER_AE_WITH_MACRON; + case LATIN_CAPITAL_LETTER_AE: return LATIN_CAPITAL_LETTER_AE_WITH_MACRON; + case '=': return IDENTICAL_TO; + case '-': return '='; + case ' ': return MODIFIER_LETTER_MACRON; + default: + if(warn)fprintf( stderr, " COMPOSE: MACRON+%04x not defined\n",(int)main); + } + break; + + case DOT_ABOVE: /* latin2 */ + switch (main) { + case 'a': return LATIN_SMALL_LETTER_A_WITH_DOT_ABOVE; + case 'A': return LATIN_CAPITAL_LETTER_A_WITH_DOT_ABOVE; + case 'c': return LATIN_SMALL_LETTER_C_WITH_DOT_ABOVE; + case 'C': return LATIN_CAPITAL_LETTER_C_WITH_DOT_ABOVE; + case 'e': return LATIN_SMALL_LETTER_E_WITH_DOT_ABOVE; + case 'E': return LATIN_CAPITAL_LETTER_E_WITH_DOT_ABOVE; + case 'g': return LATIN_SMALL_LETTER_G_WITH_DOT_ABOVE; + case 'G': return LATIN_CAPITAL_LETTER_G_WITH_DOT_ABOVE; + case 'l': return 'i'; /* correct wrong recognition */ + case 'i': return 'i'; + case LATIN_SMALL_LETTER_DOTLESS_I: return 'i'; + case 'I': return LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE; + case 'j': return 'j'; + case 'o': return LATIN_SMALL_LETTER_O_WITH_DOT_ABOVE; + case 'O': return LATIN_CAPITAL_LETTER_O_WITH_DOT_ABOVE; + case 'z': return LATIN_SMALL_LETTER_Z_WITH_DOT_ABOVE; + case 'Z': return LATIN_CAPITAL_LETTER_Z_WITH_DOT_ABOVE; + case ',': return ';'; + case '.': return ':'; + default: + if(warn)fprintf( stderr, " COMPOSE: DOT_ABOVE+%04x not defined\n",(int)main); + } + break; + + case RING_ABOVE: + switch (main) { + case 'a': return LATIN_SMALL_LETTER_A_WITH_RING_ABOVE; + case 'A': return LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE; + case 'u': return LATIN_SMALL_LETTER_U_WITH_RING_ABOVE; + case 'U': return LATIN_CAPITAL_LETTER_U_WITH_RING_ABOVE; + default: 
+ if(warn)fprintf( stderr, " COMPOSE: RING_ABOVE+%04x not defined\n",(int)main); + } + break; + + case 'e': /* e ligatures: ae, oe. */ + case 'E': + switch (main) { + case 'a': return LATIN_SMALL_LETTER_AE; + case 'A': return LATIN_CAPITAL_LETTER_AE; + case 'o': return LATIN_SMALL_LIGATURE_OE; + case 'O': return LATIN_CAPITAL_LIGATURE_OE; + case '0': return LATIN_CAPITAL_LIGATURE_OE; + default: + if(warn)fprintf( stderr, " COMPOSE: %04x+e/E not defined\n",(int)main); + } + break; + + case 'g': /* greek */ + switch (main) { + /* missing 0x37A-0x390 */ + /* weird cases: Q -> theta (it resembles a little, doesn't it?) + V -> psi (what can I do?) */ + case 'A': return GREEK_CAPITAL_LETTER_ALPHA; + case 'B': return GREEK_CAPITAL_LETTER_BETA; + case 'G': return GREEK_CAPITAL_LETTER_GAMMA; + case 'D': return GREEK_CAPITAL_LETTER_DELTA; + case 'E': return GREEK_CAPITAL_LETTER_EPSILON; + case 'Z': return GREEK_CAPITAL_LETTER_ZETA; + case 'H': return GREEK_CAPITAL_LETTER_ETA; + case 'Q': return GREEK_CAPITAL_LETTER_THETA; + case 'I': return GREEK_CAPITAL_LETTER_IOTA; + case 'K': return GREEK_CAPITAL_LETTER_KAPPA; + case 'L': return GREEK_CAPITAL_LETTER_LAMDA; + case 'M': return GREEK_CAPITAL_LETTER_MU; + case 'N': return GREEK_CAPITAL_LETTER_NU; + case 'X': return GREEK_CAPITAL_LETTER_XI; + case 'O': return GREEK_CAPITAL_LETTER_OMICRON; + case 'P': return GREEK_CAPITAL_LETTER_PI; + case 'R': return GREEK_CAPITAL_LETTER_RHO; + case 'S': return GREEK_CAPITAL_LETTER_SIGMA; + case 'T': return GREEK_CAPITAL_LETTER_TAU; + case 'Y': return GREEK_CAPITAL_LETTER_UPSILON; + case 'F': return GREEK_CAPITAL_LETTER_PHI; + case 'C': return GREEK_CAPITAL_LETTER_CHI; + case 'V': return GREEK_CAPITAL_LETTER_PSI; + case 'W': return GREEK_CAPITAL_LETTER_OMEGA; +/* + case '': return GREEK_CAPITAL_LETTER_IOTA_WITH_DIALYTIKA; + case '': return GREEK_CAPITAL_LETTER_UPSILON_WITH_DIALYTIKA; + case '': return GREEK_SMALL_LETTER_ALPHA_WITH_TONOS; + case '': return GREEK_SMALL_LETTER_EPSILON_WITH_TONOS; 
+ case '': return GREEK_SMALL_LETTER_ETA_WITH_TONOS; + case '': return GREEK_SMALL_LETTER_IOTA_WITH_TONOS; + case '': return GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS; +*/ + case 'a': return GREEK_SMALL_LETTER_ALPHA; + case 'b': return GREEK_SMALL_LETTER_BETA; + case 'g': return GREEK_SMALL_LETTER_GAMMA; + case 'd': return GREEK_SMALL_LETTER_DELTA; + case 'e': return GREEK_SMALL_LETTER_EPSILON; + case 'z': return GREEK_SMALL_LETTER_ZETA; + case 'h': return GREEK_SMALL_LETTER_ETA; + case 'q': return GREEK_SMALL_LETTER_THETA; + case 'i': return GREEK_SMALL_LETTER_IOTA; + case 'k': return GREEK_SMALL_LETTER_KAPPA; + case 'l': return GREEK_SMALL_LETTER_LAMDA; + case 'm': return GREEK_SMALL_LETTER_MU; + case 'n': return GREEK_SMALL_LETTER_NU; + case 'x': return GREEK_SMALL_LETTER_XI; + case 'o': return GREEK_SMALL_LETTER_OMICRON; + case 'p': return GREEK_SMALL_LETTER_PI; + case 'r': return GREEK_SMALL_LETTER_RHO; + case '&': return GREEK_SMALL_LETTER_FINAL_SIGMA; + case 's': return GREEK_SMALL_LETTER_SIGMA; + case 't': return GREEK_SMALL_LETTER_TAU; + case 'y': return GREEK_SMALL_LETTER_UPSILON; + case 'f': return GREEK_SMALL_LETTER_PHI; + case 'c': return GREEK_SMALL_LETTER_CHI; + case 'v': return GREEK_SMALL_LETTER_PSI; + case 'w': return GREEK_SMALL_LETTER_OMEGA; +/* + case '': return GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA; + case '': return GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA; + case '': return GREEK_SMALL_LETTER_OMICRON_WITH_TONOS; + case '': return GREEK_SMALL_LETTER_UPSILON_WITH_TONOS; + case '': return GREEK_SMALL_LETTER_OMEGA_WITH_TONOS; + case '': return GREEK_BETA_SYMBOL; + case '': return GREEK_THETA_SYMBOL; + case '': return GREEK_UPSILON_WITH_HOOK_SYMBOL; + case '': return GREEK_UPSILON_WITH_ACUTE_AND_HOOK_SYMBOL; + case '': return GREEK_UPSILON_WITH_DIAERESIS_AND_HOOK_SYMBOL; + case '': return GREEK_PHI_SYMBOL; + case '': return GREEK_PI_SYMBOL; +*/ + default: + if(warn)fprintf( stderr, " COMPOSE: GREEK %04x not defined\n",(int)main); + } 
+ break; + + default: + fprintf( stderr, " COMPOSE: modifier %04x not defined\n",(int)modifier); + } + return (wchar_t)main; +} + +#define UNDEFINED "~" + +/* Arguments: character in Unicode format, type of format to convert to. + Returns: a string containing the Unicode character converted to the chosen + format. This string is statically allocated and should not be freed. + ToDo: better using tables? + */ +const char *decode(wchar_t c, FORMAT type) { + /* static char d; --- js: big bug (missing \0) if &d returned */ + /*FIXME jb static*/ static char bbuf[8*32]; /* space for 8 buffers, rotating */ + /*FIXME jb static*/ static char *buf=bbuf; /* used for UTF8 sequences and undefined codes */ + buf+=32; if(buf>=bbuf+8*32) buf=bbuf; + buf[0]=buf[1]=buf[2]=0; + switch (type) { + case ISO8859_1: + if ( c <= 0xFF ) { /* UNICODE == ISO8859-1 */ + buf[0] = (char)c; + return buf; + } + switch (c) { /* not found in list, but perhaps we can describe it */ + /* todo: add greek. GREEK_SMALL_LETTER_ALPHA = alpha */ + + /* general puctuation */ + case HYPHEN: + return (const char *)"-"; + case FIGURE_DASH: + case EN_DASH: + return (const char *)"--"; + case EM_DASH: + return (const char *)"---"; + case LEFT_SINGLE_QUOTATION_MARK: + return (const char *)"`"; + case RIGHT_SINGLE_QUOTATION_MARK: + return (const char *)"'"; + case SINGLE_LOW_9_QUOTATION_MARK: + return (const char *)","; + case SINGLE_HIGH_REVERSED_9_QUOTATION_MARK: + return (const char *)UNDEFINED; + case LEFT_DOUBLE_QUOTATION_MARK: + return (const char *)"``"; + case RIGHT_DOUBLE_QUOTATION_MARK: + return (const char *)"''"; + case DOUBLE_LOW_9_QUOTATION_MARK: + return (const char *)",,"; + case DOUBLE_HIGH_REVERSED_9_QUOTATION_MARK: + return (const char *)UNDEFINED; + case DAGGER: + return (const char *)"+"; + case DOUBLE_DAGGER: + return (const char *)"*"; + case BULLET: + return (const char *)"*"; + case TRIANGULAR_BULLET: + return (const char *)"*"; + case HYPHENATION_POINT: + return (const char *)"-"; + case 
HORIZONTAL_ELLIPSIS: + return (const char *)"..."; + case PER_MILLE_SIGN: + return (const char *)"%%"; /* awk! */ + case SINGLE_LEFT_POINTING_ANGLE_QUOTATION_MARK: + return (const char *)"<"; + case SINGLE_RIGHT_POINTING_ANGLE_QUOTATION_MARK: + return (const char *)">"; + case EURO_CURRENCY_SIGN: + return (const char *)"EUR"; /* change it! */ + + /* ligatures */ + case LATIN_SMALL_LIGATURE_FF: + return (const char *)"ff"; + case LATIN_SMALL_LIGATURE_FI: + return (const char *)"fi"; + case LATIN_SMALL_LIGATURE_FL: + return (const char *)"fl"; + case LATIN_SMALL_LIGATURE_FFI: + return (const char *)"ffi"; + case LATIN_SMALL_LIGATURE_FFL: + return (const char *)"ffl"; + case LATIN_SMALL_LIGATURE_LONG_S_T: + case LATIN_SMALL_LIGATURE_ST: + return (const char *)"st"; + + /* extra */ + case UNKNOWN: + return (const char *)"_"; + case PICTURE: + return (const char *)"_"; /* Due to Mobile OCR */ + + default: + /* snprintf seems to be no standard, so I use insecure sprintf */ + sprintf(buf,"\\code(%04x)",(unsigned)c); + return buf; /* UNDEFINED; */ + } + break; + case TeX: + if ( c >= SPACE && c <= TILDE ) { /* ASCII */ + switch (c) { + case '$': + return (const char *)"\\$"; + case '&': + return (const char *)"\\&"; + case '%': + return (const char *)"\\%"; + case '#': + return (const char *)"\\#"; + case '_': + return (const char *)"\\_"; + case '{': + return (const char *)"\\{"; + case '}': + return (const char *)"\\}"; + case '\\': + return (const char *)"$\\backslash$"; + case '~': + return (const char *)"\\~{}"; + case '^': + return (const char *)"\\^{}"; + default: + buf[0] = (char)c; + return (const char *)buf; + } + } + switch (c) { + /* ISO8859_1 */ + case NO_BREAK_SPACE: + return (const char *)"~"; + case INVERTED_EXCLAMATION_MARK: + return (const char *)"!'"; + case CENT_SIGN: + return (const char *)"\\textcent"; /* \usepackage{textcomp} */ + case POUND_SIGN: + return (const char *)"\\pounds"; + case EURO_CURRENCY_SIGN: + return (const char *)"\\euro"; /* 
\usepackage{eurosans} */ + case CURRENCY_SIGN: + return (const char *)"\\textcurrency"; /* \usepackage{textcomp} */ + case YEN_SIGN: + return (const char *)"\\textyen"; /* \usepackage{textcomp} */ + case BROKEN_BAR: + return (const char *)"\\textbrokenbar"; /* \usepackage{textcomp} */ + case SECTION_SIGN: + return (const char *)"\\S"; + case DIAERESIS: + return (const char *)"\""; + case COPYRIGHT_SIGN: + return (const char *)"\\copyright"; + case FEMININE_ORDINAL_INDICATOR: + return (const char *)"$^{\\underbar{a}}$"; + case LEFT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK: + return (const char *)"\\flqq{}"; + case NOT_SIGN: + return (const char *)"$\\lnot$"; + case SOFT_HYPHEN: + return (const char *)"\\-"; + case REGISTERED_SIGN: + return (const char *)"\\textregistered";/* \usepackage{textcomp} */ + case MACRON: + return (const char *)"\\textasciimacron";/* \usepackage{textcomp} */ + case DEGREE_SIGN: + return (const char *)"$^{o}$"; + case PLUS_MINUS_SIGN: + return (const char *)"$\\pm$"; + case SUPERSCRIPT_TWO: + return (const char *)"$^{2}$"; + case SUPERSCRIPT_THREE: + return (const char *)"$^{3}$"; + case ACUTE_ACCENT: + return (const char *)"\\( \\prime \\)"; + case MICRO_SIGN: + return (const char *)"$\\mu$"; + case PILCROW_SIGN: + return (const char *)"\\P"; + case MIDDLE_DOT: + return (const char *)"$\\cdot$"; + case CEDILLA: + return (const char *)"\\,"; + case SUPERSCRIPT_ONE: + return (const char *)"$^{1}$"; + case MASCULINE_ORDINAL_INDICATOR: + return (const char *)"$^{\\underbar{o}}$"; + case RIGHT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK: + return (const char *)"\\frqq{}"; + case VULGAR_FRACTION_ONE_QUARTER: /* these fractions are not good*/ + return (const char *)"\\( 1\\over 4 \\)"; + case VULGAR_FRACTION_ONE_HALF: + return (const char *)"\\( 1\\over 2 \\)"; + case VULGAR_FRACTION_THREE_QUARTERS: + return (const char *)"\\( 3\\over 4 \\)"; + case INVERTED_QUESTION_MARK: + return (const char *)"?'"; + case LATIN_CAPITAL_LETTER_A_WITH_GRAVE: + return (const 
char *)"\\`A"; + case LATIN_CAPITAL_LETTER_A_WITH_ACUTE: + return (const char *)"\\'A"; + case LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX: + return (const char *)"\\^A"; + case LATIN_CAPITAL_LETTER_A_WITH_TILDE: + return (const char *)"\\~A"; + case LATIN_CAPITAL_LETTER_A_WITH_DIAERESIS: + return (const char *)"\\\"A"; + case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: + return (const char *)"\\AA"; + case LATIN_CAPITAL_LETTER_AE: + return (const char *)"\\AE"; + case LATIN_CAPITAL_LETTER_C_WITH_CARON: + return (const char *)"\\v{C}"; + case LATIN_CAPITAL_LETTER_C_WITH_CEDILLA: + return (const char *)"\\C"; + case LATIN_CAPITAL_LETTER_E_WITH_GRAVE: + return (const char *)"\\`E"; + case LATIN_CAPITAL_LETTER_E_WITH_ACUTE: + return (const char *)"\\'E"; + case LATIN_CAPITAL_LETTER_E_WITH_CARON: + return (const char *)"\\v{E}"; + case LATIN_CAPITAL_LETTER_E_WITH_CIRCUMFLEX: + return (const char *)"\\^E"; + case LATIN_CAPITAL_LETTER_E_WITH_DIAERESIS: + return (const char *)"\\\"E"; + case LATIN_CAPITAL_LETTER_I_WITH_GRAVE: + return (const char *)"\\`I"; + case LATIN_CAPITAL_LETTER_I_WITH_ACUTE: + return (const char *)"\\'I"; + case LATIN_CAPITAL_LETTER_I_WITH_CIRCUMFLEX: + return (const char *)"\\^I"; + case LATIN_CAPITAL_LETTER_I_WITH_DIAERESIS: + return (const char *)"\\\"I"; + case LATIN_CAPITAL_LETTER_ETH: + return (const char *)UNDEFINED; + case LATIN_CAPITAL_LETTER_N_WITH_TILDE: + return (const char *)"\\~N"; + case LATIN_CAPITAL_LETTER_O_WITH_GRAVE: + return (const char *)"\\`O"; + case LATIN_CAPITAL_LETTER_O_WITH_ACUTE: + return (const char *)"\\'O"; + case LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX: + return (const char *)"\\^O"; + case LATIN_CAPITAL_LETTER_O_WITH_TILDE: + return (const char *)"\\~O"; + case LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS: + return (const char *)"\\\"O"; + case MULTIPLICATION_SIGN: + return (const char *)"$\\times$"; + case LATIN_CAPITAL_LETTER_O_WITH_STROKE: + return (const char *)"\\O"; + case LATIN_CAPITAL_LETTER_S_WITH_CARON: + return (const 
char *)"\\v{S}"; + case LATIN_CAPITAL_LETTER_U_WITH_GRAVE: + return (const char *)"\\`U"; + case LATIN_CAPITAL_LETTER_U_WITH_ACUTE: + return (const char *)"\\'U"; + case LATIN_CAPITAL_LETTER_U_WITH_CIRCUMFLEX: + return (const char *)"\\^U"; + case LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS: + return (const char *)"\\\"U"; + case LATIN_CAPITAL_LETTER_Y_WITH_ACUTE: + return (const char *)"\\'Y"; + case LATIN_CAPITAL_LETTER_Z_WITH_CARON: + return (const char *)"\\v{Z}"; + case LATIN_CAPITAL_LETTER_THORN: + return (const char *)UNDEFINED; + case LATIN_SMALL_LETTER_SHARP_S: + return (const char *)"\\ss"; + case LATIN_SMALL_LETTER_A_WITH_GRAVE: + return (const char *)"\\`a"; + case LATIN_SMALL_LETTER_A_WITH_ACUTE: + return (const char *)"\\'a"; + case LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX: + return (const char *)"\\^a"; + case LATIN_SMALL_LETTER_A_WITH_TILDE: + return (const char *)"\\~a"; + case LATIN_SMALL_LETTER_A_WITH_DIAERESIS: + return (const char *)"\\\"a"; + case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: + return (const char *)"\\aa"; + case LATIN_SMALL_LETTER_AE: + return (const char *)"\\ae"; + case LATIN_SMALL_LETTER_C_WITH_CARON: + return (const char *)"\\v{c}"; + case LATIN_SMALL_LETTER_C_WITH_CEDILLA: + return (const char *)"\\c"; + case LATIN_SMALL_LETTER_E_WITH_GRAVE: + return (const char *)"\\`e"; + case LATIN_SMALL_LETTER_E_WITH_ACUTE: + return (const char *)"\\'e"; + case LATIN_SMALL_LETTER_E_WITH_CARON: + return (const char *)"\\v{e}"; + case LATIN_SMALL_LETTER_E_WITH_CIRCUMFLEX: + return (const char *)"\\^e"; + case LATIN_SMALL_LETTER_E_WITH_DIAERESIS: + return (const char *)"\\\"e"; + case LATIN_SMALL_LETTER_I_WITH_GRAVE: + return (const char *)"\\`i"; + case LATIN_SMALL_LETTER_I_WITH_ACUTE: + return (const char *)"\\'i"; + case LATIN_SMALL_LETTER_I_WITH_CIRCUMFLEX: + return (const char *)"\\^i"; + case LATIN_SMALL_LETTER_I_WITH_DIAERESIS: + return (const char *)"\\\"i"; + case LATIN_SMALL_LETTER_ETH: + return (const char *)UNDEFINED; + case 
LATIN_SMALL_LETTER_N_WITH_TILDE: + return (const char *)"\\~n"; + case LATIN_SMALL_LETTER_O_WITH_GRAVE: + return (const char *)"\\`o"; + case LATIN_SMALL_LETTER_O_WITH_ACUTE: + return (const char *)"\\'o"; + case LATIN_SMALL_LETTER_O_WITH_CIRCUMFLEX: + return (const char *)"\\^o"; + case LATIN_SMALL_LETTER_O_WITH_TILDE: + return (const char *)"\\~o"; + case LATIN_SMALL_LETTER_O_WITH_DIAERESIS: + return (const char *)"\\\"o"; + case DIVISION_SIGN: + return (const char *)"$\\div$"; + case LATIN_SMALL_LETTER_O_WITH_STROKE: + return (const char *)"\\o"; + case LATIN_SMALL_LETTER_S_WITH_CARON: + return (const char *)"\\v{s}"; + case LATIN_SMALL_LETTER_U_WITH_GRAVE: + return (const char *)"\\`u"; + case LATIN_SMALL_LETTER_U_WITH_ACUTE: + return (const char *)"\\'u"; + case LATIN_SMALL_LETTER_U_WITH_CIRCUMFLEX: + return (const char *)"\\^u"; + case LATIN_SMALL_LETTER_U_WITH_DIAERESIS: + return (const char *)"\\\"u"; + case LATIN_SMALL_LETTER_Y_WITH_ACUTE: + return (const char *)"\\'y"; + case LATIN_SMALL_LETTER_THORN: + return (const char *)UNDEFINED; + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + return (const char *)"\\\"y"; + case LATIN_SMALL_LETTER_Z_WITH_CARON: + return (const char *)"\\v{z}"; + + /* greek */ + /* some (punctuation, accents, accented capital) greek letters missing*/ + case GREEK_CAPITAL_LETTER_ALPHA: + return (const char *)"A"; + case GREEK_CAPITAL_LETTER_BETA: + return (const char *)"B"; + case GREEK_CAPITAL_LETTER_GAMMA: + return (const char *)"\\( \\Gamma \\)"; + case GREEK_CAPITAL_LETTER_DELTA: + return (const char *)"\\( \\Delta \\)"; + case GREEK_CAPITAL_LETTER_EPSILON: + return (const char *)"E"; + case GREEK_CAPITAL_LETTER_ZETA: + return (const char *)"Z"; + case GREEK_CAPITAL_LETTER_ETA: + return (const char *)"H"; + case GREEK_CAPITAL_LETTER_THETA: + return (const char *)"\\( \\Theta \\)"; + case GREEK_CAPITAL_LETTER_IOTA: + return (const char *)"I"; + case GREEK_CAPITAL_LETTER_KAPPA: + return (const char *)"K"; + case 
GREEK_CAPITAL_LETTER_LAMDA: + return (const char *)"\\( \\Lambda \\)"; + case GREEK_CAPITAL_LETTER_MU: + return (const char *)"M"; + case GREEK_CAPITAL_LETTER_NU: + return (const char *)"N"; + case GREEK_CAPITAL_LETTER_XI: + return (const char *)"\\( \\Xi \\)"; + case GREEK_CAPITAL_LETTER_OMICRON: + return (const char *)"O"; + case GREEK_CAPITAL_LETTER_PI: + return (const char *)"\\( \\Pi \\)"; + case GREEK_CAPITAL_LETTER_RHO: + return (const char *)"P"; + case GREEK_CAPITAL_LETTER_SIGMA: + return (const char *)"\\( \\Sigma \\)"; + case GREEK_CAPITAL_LETTER_TAU: + return (const char *)"T"; + case GREEK_CAPITAL_LETTER_UPSILON: + return (const char *)"\\( \\Upsilon \\)"; + case GREEK_CAPITAL_LETTER_PHI: + return (const char *)"\\( \\Phi \\)"; + case GREEK_CAPITAL_LETTER_CHI: + return (const char *)"\\( \\Chi \\)"; + case GREEK_CAPITAL_LETTER_PSI: + return (const char *)"\\( \\Psi \\)"; + case GREEK_CAPITAL_LETTER_OMEGA: + return (const char *)"\\( \\Omega \\)"; + case GREEK_CAPITAL_LETTER_IOTA_WITH_DIALYTIKA: + return (const char *)UNDEFINED; + case GREEK_CAPITAL_LETTER_UPSILON_WITH_DIALYTIKA: + return (const char *)UNDEFINED; + case GREEK_SMALL_LETTER_ALPHA_WITH_TONOS: + return (const char *)UNDEFINED; + case GREEK_SMALL_LETTER_EPSILON_WITH_TONOS: + return (const char *)UNDEFINED; + case GREEK_SMALL_LETTER_ETA_WITH_TONOS: + return (const char *)UNDEFINED; + case GREEK_SMALL_LETTER_IOTA_WITH_TONOS: + return (const char *)UNDEFINED; + case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA_AND_TONOS: + return (const char *)UNDEFINED; + case GREEK_SMALL_LETTER_ALPHA: + return (const char *)"\\( \\alpha \\)"; + case GREEK_SMALL_LETTER_BETA: + return (const char *)"\\( \\beta \\)"; + case GREEK_SMALL_LETTER_GAMMA: + return (const char *)"\\( \\gamma \\)"; + case GREEK_SMALL_LETTER_DELTA: + return (const char *)"\\( \\delta \\)"; + case GREEK_SMALL_LETTER_EPSILON: + return (const char *)"\\( \\epsilon \\)"; + case GREEK_SMALL_LETTER_ZETA: + return (const char *)"\\( \\zeta \\)"; + 
case GREEK_SMALL_LETTER_ETA: + return (const char *)"\\( \\eta \\)"; + case GREEK_SMALL_LETTER_THETA: + return (const char *)"\\( \\theta \\)"; + case GREEK_SMALL_LETTER_IOTA: + return (const char *)"\\( \\iota \\)"; + case GREEK_SMALL_LETTER_KAPPA: + return (const char *)"\\( \\kappa \\)"; + case GREEK_SMALL_LETTER_LAMDA: + return (const char *)"\\( \\lambda \\)"; + case GREEK_SMALL_LETTER_MU: + return (const char *)"\\( \\mu \\)"; + case GREEK_SMALL_LETTER_NU: + return (const char *)"\\( \\nu \\)"; + case GREEK_SMALL_LETTER_XI: + return (const char *)"\\( \\xi \\)"; + case GREEK_SMALL_LETTER_OMICRON: + return (const char *)"\\( \\omicron \\)"; + case GREEK_SMALL_LETTER_PI: + return (const char *)"\\( \\pi \\)"; + case GREEK_SMALL_LETTER_RHO: + return (const char *)"\\( \\rho \\)"; + case GREEK_SMALL_LETTER_FINAL_SIGMA: + return (const char *)"\\( \\varsigma \\)"; + case GREEK_SMALL_LETTER_SIGMA: + return (const char *)"\\( \\sigma \\)"; + case GREEK_SMALL_LETTER_TAU: + return (const char *)"\\( \\tau \\)"; + case GREEK_SMALL_LETTER_UPSILON: + return (const char *)"\\( \\upsilon \\)"; + case GREEK_SMALL_LETTER_PHI: + return (const char *)"\\( \\varphi \\)"; + case GREEK_SMALL_LETTER_CHI: + return (const char *)"\\( \\chi \\)"; + case GREEK_SMALL_LETTER_PSI: + return (const char *)"\\( \\psi \\)"; + case GREEK_SMALL_LETTER_OMEGA: + return (const char *)"\\( \\omega \\)"; + case GREEK_SMALL_LETTER_IOTA_WITH_DIALYTIKA: + return (const char *)UNDEFINED; + case GREEK_SMALL_LETTER_UPSILON_WITH_DIALYTIKA: + return (const char *)UNDEFINED; + case GREEK_SMALL_LETTER_OMICRON_WITH_TONOS: + return (const char *)UNDEFINED; + case GREEK_SMALL_LETTER_UPSILON_WITH_TONOS: + return (const char *)UNDEFINED; + case GREEK_SMALL_LETTER_OMEGA_WITH_TONOS: + return (const char *)UNDEFINED; + case GREEK_BETA_SYMBOL: + return (const char *)UNDEFINED; + case GREEK_THETA_SYMBOL: + return (const char *)"\\( \\vartheta \\)"; + case GREEK_UPSILON_WITH_HOOK_SYMBOL: + return (const char 
*)UNDEFINED; + case GREEK_UPSILON_WITH_ACUTE_AND_HOOK_SYMBOL: + return (const char *)UNDEFINED; + case GREEK_UPSILON_WITH_DIAERESIS_AND_HOOK_SYMBOL: + return (const char *)UNDEFINED; + case GREEK_PHI_SYMBOL: + return (const char *)"\\( \\phi \\)"; + case GREEK_PI_SYMBOL: + return (const char *)"\\( \\varpi \\)"; + /* and some greek letters missing*/ + + /* punctuation (partial) */ + case HYPHEN: + return (const char *)"-"; + case NON_BREAKING_HYPHEN: + return (const char *)UNDEFINED; + case FIGURE_DASH: + case EN_DASH: + return (const char *)"--"; + case EM_DASH: + return (const char *)"---"; + case HORIZONTAL_BAR: + return (const char *)UNDEFINED; + case LEFT_SINGLE_QUOTATION_MARK: + return (const char *)"`"; + case RIGHT_SINGLE_QUOTATION_MARK: + return (const char *)"'"; + case SINGLE_LOW_9_QUOTATION_MARK: + return (const char *)"\\glq{}"; + case SINGLE_HIGH_REVERSED_9_QUOTATION_MARK: + return (const char *)UNDEFINED; + case LEFT_DOUBLE_QUOTATION_MARK: + return (const char *)"``"; + case RIGHT_DOUBLE_QUOTATION_MARK: + return (const char *)"''"; + case DOUBLE_LOW_9_QUOTATION_MARK: + return (const char *)"\\glqq{}"; + case DOUBLE_HIGH_REVERSED_9_QUOTATION_MARK: + return (const char *)UNDEFINED; + case DAGGER: + return (const char *)"\\dag"; + case DOUBLE_DAGGER: + return (const char *)"\\ddag"; + case BULLET: + return (const char *)"$\\bullet$"; + case TRIANGULAR_BULLET: + return (const char *)"$\\blacktriangleright"; + case HYPHENATION_POINT: + return (const char *)"\\-"; + case HORIZONTAL_ELLIPSIS: + return (const char *)"\\ldots"; + case PER_MILLE_SIGN: + return (const char *)UNDEFINED; + case SINGLE_LEFT_POINTING_ANGLE_QUOTATION_MARK: + return (const char *)"\\flq{}"; + case SINGLE_RIGHT_POINTING_ANGLE_QUOTATION_MARK: + return (const char *)"\\frq{}"; + /* ligatures */ + case LATIN_SMALL_LIGATURE_FF: + return (const char *)"ff"; + case LATIN_SMALL_LIGATURE_FI: + return (const char *)"fi"; + case LATIN_SMALL_LIGATURE_FL: + return (const char *)"fl"; + case 
LATIN_SMALL_LIGATURE_FFI: + return (const char *)"ffi"; + case LATIN_SMALL_LIGATURE_FFL: + return (const char *)"ffl"; + case LATIN_SMALL_LIGATURE_LONG_S_T: + case LATIN_SMALL_LIGATURE_ST: + return (const char *)"st"; + /* reserved */ + case 0: + return (const char *)""; + case UNKNOWN: + return (const char *)"\\_"; + case PICTURE: + return (const char *)"(PICTURE)"; + default: + /* snprintf seems to be no standard, so I use insecure sprintf */ + sprintf(buf,"\\symbol{%u}",(unsigned)c); + return buf; /* UNDEFINED; */ + } + case HTML: + if ( c >= SPACE && c <= TILDE ) { /* ASCII */ + switch (c) { + case '&': + return (const char *)"&"; + /* semicolon must not be coded */ + case '\'': + return (const char *)"'"; + case '"': + return (const char *)"""; + case '<': + return (const char *)"<"; + case '>': + return (const char *)">"; + } + buf[0] = (char)c; + return buf; + } + switch (c) { + case PICTURE: + return (const char *)""; + case UNKNOWN: + return (const char *)"_"; /* better use colored symbol? */ + case LINE_FEED: + return (const char *)"
"; /* \n handled somwhere else? */ + case FORM_FEED: + case CARRIAGE_RETURN: + return (const char *)"
"; + case NO_BREAK_SPACE: + return (const char *)""; + case INVERTED_EXCLAMATION_MARK: + return (const char *)"¡"; + case CENT_SIGN: + return (const char *)"¢"; + case POUND_SIGN: + return (const char *)"£"; + case CURRENCY_SIGN: + return (const char *)"¤"; + case YEN_SIGN: + return (const char *)"¥"; + case BROKEN_BAR: + return (const char *)"¦"; + case SECTION_SIGN: + return (const char *)"§"; + case DIAERESIS: + return (const char *)"¨"; + case COPYRIGHT_SIGN: + return (const char *)"©"; + case FEMININE_ORDINAL_INDICATOR: + return (const char *)"ªem;"; + case LEFT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK: + return (const char *)"«"; + case NOT_SIGN: + return (const char *)"¬"; + case SOFT_HYPHEN: + return (const char *)"­"; + case REGISTERED_SIGN: + return (const char *)"®"; + case MACRON: + return (const char *)"¯"; + case DEGREE_SIGN: + return (const char *)"°"; + case PLUS_MINUS_SIGN: + return (const char *)"±"; + case SUPERSCRIPT_TWO: + return (const char *)"²"; + case SUPERSCRIPT_THREE: + return (const char *)"³"; + case ACUTE_ACCENT: + return (const char *)"´"; + case MICRO_SIGN: + return (const char *)"µ"; + case PILCROW_SIGN: + return (const char *)"¶"; + case MIDDLE_DOT: + return (const char *)"·"; + case CEDILLA: + return (const char *)"¸"; + case SUPERSCRIPT_ONE: + return (const char *)"¹"; + case MASCULINE_ORDINAL_INDICATOR: + return (const char *)"º"; + case RIGHT_POINTING_DOUBLE_ANGLE_QUOTATION_MARK: + return (const char *)"»"; + case VULGAR_FRACTION_ONE_QUARTER: + return (const char *)"¼"; + case VULGAR_FRACTION_ONE_HALF: + return (const char *)"½"; + case VULGAR_FRACTION_THREE_QUARTERS: + return (const char *)"¾"; + case INVERTED_QUESTION_MARK: + return (const char *)"¿"; + case LATIN_CAPITAL_LETTER_A_WITH_GRAVE: + return (const char *)"À"; + case LATIN_CAPITAL_LETTER_A_WITH_ACUTE: + return (const char *)"Á"; + case LATIN_CAPITAL_LETTER_A_WITH_BREVE: + return (const char *)"Ă"; + case LATIN_CAPITAL_LETTER_A_WITH_CIRCUMFLEX: + return (const char 
*)"Â"; + case LATIN_CAPITAL_LETTER_A_WITH_TILDE: + return (const char *)"Ã"; + case LATIN_CAPITAL_LETTER_A_WITH_DIAERESIS: + return (const char *)"Ä"; + case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE: + return (const char *)"Å"; + case LATIN_CAPITAL_LETTER_AE: + return (const char *)"Æ"; + case LATIN_CAPITAL_LETTER_C_WITH_CARON: + return (const char *)"Č"; + case LATIN_CAPITAL_LETTER_C_WITH_CEDILLA: + return (const char *)"Ç"; + case LATIN_CAPITAL_LETTER_E_WITH_GRAVE: + return (const char *)"È"; + case LATIN_CAPITAL_LETTER_E_WITH_ACUTE: + return (const char *)"É"; + case LATIN_CAPITAL_LETTER_E_WITH_CARON: + return (const char *)"Ě"; + case LATIN_CAPITAL_LETTER_E_WITH_CIRCUMFLEX: + return (const char *)"Ê"; + case LATIN_CAPITAL_LETTER_E_WITH_DIAERESIS: + return (const char *)"Ë"; + case LATIN_CAPITAL_LETTER_I_WITH_GRAVE: + return (const char *)"Ì"; + case LATIN_CAPITAL_LETTER_I_WITH_ACUTE: + return (const char *)"Í"; + case LATIN_CAPITAL_LETTER_I_WITH_CIRCUMFLEX: + return (const char *)"Î"; + case LATIN_CAPITAL_LETTER_I_WITH_DIAERESIS: + return (const char *)"Ï"; + case LATIN_CAPITAL_LETTER_ETH: + return (const char *)"Ð"; + case LATIN_CAPITAL_LETTER_N_WITH_TILDE: + return (const char *)"Ñ"; + case LATIN_CAPITAL_LETTER_O_WITH_GRAVE: + return (const char *)"Ò"; + case LATIN_CAPITAL_LETTER_O_WITH_ACUTE: + return (const char *)"Ó"; + case LATIN_CAPITAL_LETTER_O_WITH_CIRCUMFLEX: + return (const char *)"Ô"; + case LATIN_CAPITAL_LETTER_O_WITH_TILDE: + return (const char *)"Õ"; + case LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS: + return (const char *)"Ö"; + case MULTIPLICATION_SIGN: + return (const char *)"×"; + case LATIN_CAPITAL_LETTER_O_WITH_STROKE: + return (const char *)"Ø"; + case LATIN_CAPITAL_LETTER_S_WITH_CARON: + return (const char *)"Š"; + case LATIN_CAPITAL_LETTER_U_WITH_GRAVE: + return (const char *)"Ù"; + case LATIN_CAPITAL_LETTER_U_WITH_ACUTE: + return (const char *)"Ú"; + case LATIN_CAPITAL_LETTER_U_WITH_CIRCUMFLEX: + return (const char *)"Û"; + case 
LATIN_CAPITAL_LETTER_U_WITH_DIAERESIS: + return (const char *)"Ü"; + case LATIN_CAPITAL_LETTER_Y_WITH_ACUTE: + return (const char *)"Ý"; + case LATIN_CAPITAL_LETTER_Z_WITH_CARON: + return (const char *)"Ž"; + case LATIN_CAPITAL_LETTER_THORN: + return (const char *)"Þ"; + case LATIN_SMALL_LETTER_SHARP_S: + return (const char *)"ß"; + case LATIN_SMALL_LETTER_A_WITH_GRAVE: + return (const char *)"à"; + case LATIN_SMALL_LETTER_A_WITH_ACUTE: + return (const char *)"á"; + case LATIN_SMALL_LETTER_A_WITH_BREVE: + return (const char *)"ă"; + case LATIN_SMALL_LETTER_A_WITH_CARON: + return (const char *)"&acaron;"; + case LATIN_SMALL_LETTER_A_WITH_CIRCUMFLEX: + return (const char *)"â"; + case LATIN_SMALL_LETTER_A_WITH_TILDE: + return (const char *)"ã"; + case LATIN_SMALL_LETTER_A_WITH_DIAERESIS: + return (const char *)"ä"; + case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE: + return (const char *)"å"; + case LATIN_SMALL_LETTER_AE: + return (const char *)"æ"; + case LATIN_SMALL_LETTER_C_WITH_CARON: + return (const char *)"č"; + case LATIN_SMALL_LETTER_C_WITH_CEDILLA: + return (const char *)"ç"; + case LATIN_SMALL_LETTER_E_WITH_GRAVE: + return (const char *)"è"; + case LATIN_SMALL_LETTER_E_WITH_ACUTE: + return (const char *)"é"; + case LATIN_SMALL_LETTER_E_WITH_CARON: + return (const char *)"ě"; + case LATIN_SMALL_LETTER_E_WITH_CIRCUMFLEX: + return (const char *)"ê"; + case LATIN_SMALL_LETTER_E_WITH_DIAERESIS: + return (const char *)"ë"; + case LATIN_SMALL_LETTER_I_WITH_GRAVE: + return (const char *)"ì"; + case LATIN_SMALL_LETTER_I_WITH_ACUTE: + return (const char *)"í"; + case LATIN_SMALL_LETTER_I_WITH_CIRCUMFLEX: + return (const char *)"î"; + case LATIN_SMALL_LETTER_I_WITH_DIAERESIS: + return (const char *)"ï"; + case LATIN_SMALL_LETTER_ETH: + return (const char *)"ð"; + case LATIN_SMALL_LETTER_N_WITH_TILDE: + return (const char *)"ñ"; + case LATIN_SMALL_LETTER_O_WITH_GRAVE: + return (const char *)"ò"; + case LATIN_SMALL_LETTER_O_WITH_ACUTE: + return (const char *)"ó"; + case 
LATIN_SMALL_LETTER_O_WITH_CIRCUMFLEX: + return (const char *)"ô"; + case LATIN_SMALL_LETTER_O_WITH_TILDE: + return (const char *)"õ"; + case LATIN_SMALL_LETTER_O_WITH_DIAERESIS: + return (const char *)"ö"; + case DIVISION_SIGN: + return (const char *)"÷"; + case LATIN_SMALL_LETTER_O_WITH_STROKE: + return (const char *)"ø"; + case LATIN_SMALL_LETTER_S_WITH_CARON: + return (const char *)"š"; + case LATIN_SMALL_LETTER_U_WITH_GRAVE: + return (const char *)"ù"; + case LATIN_SMALL_LETTER_U_WITH_ACUTE: + return (const char *)"ú"; + case LATIN_SMALL_LETTER_U_WITH_CIRCUMFLEX: + return (const char *)"û"; + case LATIN_SMALL_LETTER_U_WITH_DIAERESIS: + return (const char *)"ü"; + case LATIN_SMALL_LETTER_Y_WITH_ACUTE: + return (const char *)"ý"; + case LATIN_SMALL_LETTER_THORN: + return (const char *)"þ"; + case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS: + return (const char *)"ÿ"; + case LATIN_SMALL_LETTER_Z_WITH_CARON: + return (const char *)"ž"; + case EURO_CURRENCY_SIGN: + return (const char *)"€"; + case 0: + return (const char *)""; + default: + sprintf(buf,"&#%u;",(unsigned)c); + return buf; /* undefined */ + } + /* break; unreachable code */ + case XML: /* only 5 &xxx;-ENTITIES ar defined by default */ + if ( c >= SPACE && c <= TILDE ) { /* ASCII */ + switch (c) { + case '&': + return (const char *)"&"; + case '\'': + return (const char *)"'"; + case '"': + return (const char *)"""; + case '<': + return (const char *)"<"; + case '>': + return (const char *)">"; + } + buf[0] = (char)c; + return buf; + } + switch (c) { /* subject of change! */ + case PICTURE: + return (const char *)"(PICTURE)"; + case UNKNOWN: + return (const char *)"_"; /* better use colored symbol? */ + case LINE_FEED: /* \n handled somwhere else? */ + case FORM_FEED: + case CARRIAGE_RETURN: + return (const char *)"
"; + case NO_BREAK_SPACE: + return (const char *)"
"; + case 0: + return (const char *)""; + default: + sprintf(buf,"&#x%03x;",(unsigned)c); + return buf; /* undefined */ + } + /* break; unreachable code */ + case SGML: + switch (c) { + default: + sprintf(buf,"&#%u;",(unsigned)c); + return buf; /* UNDEFINED */ + } + /* break; unreachable code */ + case ASCII: /* mainly used for debugging */ + if ( c=='\n' || (c>= 0x20 && c <= 0x7F) ) { + buf[0] = (char)c; + return buf; + } + switch (c) { + /* extra */ + case UNKNOWN: + return (const char *)"(?)"; + case PICTURE: + return (const char *)"(?)"; + + default: + /* snprintf seems to be no standard, so I use insecure sprintf */ + if ((unsigned)c>255) sprintf(buf,"(0x%04x)",(unsigned)c); + else sprintf(buf,"(0x%02x)",(unsigned)c); + return buf; /* UNDEFINED; */ + } + /* break; unreachable code */ + default: /* use UTF8 as default, test with xterm -u8 */ + /* extra */ + if ( c == UNKNOWN ) return (const char *)"_"; + if ( c == PICTURE ) return (const char *)"_"; /* Due to Mobile OCR */ + if ( c <= (wchar_t)0x0000007F ) { /* UTF8 == 7bit ASCII */ + buf[0] = (char)c; + return buf; + } + if ( c <= (wchar_t)0x000007FF ) { /* UTF8 == 11bit */ + buf[0] = (char)(0xc0|((c>> 6) & 0x1f)); /* 110xxxxx */ + buf[1] = (char)(0x80|( c & 0x3f)); /* 10xxxxxx */ + buf[2] = (char)0; /* terminate string */ + return buf; + } + /* wchar_t is 16bit for Borland-C !? 
Jan07 */ + if ( c <= (wchar_t)0x0000FFFF ) { /* UTF8 == 16bit */ + buf[0] = (char)(0xe0|((c>>12) & 0x0f)); /* 1110xxxx */ + buf[1] = (char)(0x80|((c>> 6) & 0x3f)); /* 10xxxxxx */ + buf[2] = (char)(0x80|( c & 0x3f)); /* 10xxxxxx */ + buf[3] = (char)0; /* terminate string */ + return buf; + } + if ( c <= (wchar_t)0x001FFFFF ) { /* UTF8 == 21bit */ + buf[0] = (char)(0xf0|((c>>18) & 0x07)); /* 11110xxx */ + buf[1] = (char)(0x80|((c>>12) & 0x3f)); /* 10xxxxxx */ + buf[2] = (char)(0x80|((c>> 6) & 0x3f)); /* 10xxxxxx */ + buf[3] = (char)(0x80|( c & 0x3f)); /* 10xxxxxx */ + buf[4] = (char)0; /* terminate string */ + return buf; + } + if ( c <= (wchar_t)0x03FFFFFF ) { /* UTF8 == 26bit */ + buf[0] = (char)(0xf8|((c>>24) & 0x03)); /* 111110xx */ + buf[1] = (char)(0x80|((c>>18) & 0x3f)); /* 10xxxxxx */ + buf[2] = (char)(0x80|((c>>12) & 0x3f)); /* 10xxxxxx */ + buf[3] = (char)(0x80|((c>> 6) & 0x3f)); /* 10xxxxxx */ + buf[4] = (char)(0x80|( c & 0x3f)); /* 10xxxxxx */ + buf[5] = (char)0; /* terminate string */ + return buf; + } + if ( c <= (wchar_t)0x7FFFFFFF ) { /* UTF8 == 31bit */ + buf[0] = (char)(0xfc|((c>>30) & 0x01)); /* 1111110x */ + buf[1] = (char)(0x80|((c>>24) & 0x3f)); /* 10xxxxxx */ + buf[2] = (char)(0x80|((c>>18) & 0x3f)); /* 10xxxxxx */ + buf[3] = (char)(0x80|((c>>12) & 0x3f)); /* 10xxxxxx */ + buf[4] = (char)(0x80|((c>> 6) & 0x3f)); /* 10xxxxxx */ + buf[5] = (char)(0x80|( c & 0x3f)); /* 10xxxxxx */ + buf[6] = (char)0; /* terminate string */ + return buf; + } + return (const char *)UNDEFINED; + } +}