00001 /* AbiSource Application Framework 00002 * Copyright (C) 2000 00003 * Orignially by Vlad Harchev <hvv@hippo.ru> 00004 * 00005 * This program is free software; you can redistribute it and/or 00006 * modify it under the terms of the GNU General Public License 00007 * as published by the Free Software Foundation; either version 2 00008 * of the License, or (at your option) any later version. 00009 * 00010 * This program is distributed in the hope that it will be useful, 00011 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00012 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 00013 * GNU General Public License for more details. 00014 * 00015 * You should have received a copy of the GNU General Public License 00016 * along with this program; if not, write to the Free Software 00017 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 00018 * 02110-1301 USA. 00019 */ 00020 00021 #ifndef XAP_ENCMGR_H 00022 #define XAP_ENCMGR_H 00023 00024 /* pre-emptive dismissal; ut_types.h is needed by just about everything, 00025 * so even if it's commented out in-file that's still a lot of work for 00026 * the preprocessor to do... 00027 */ 00028 #ifndef UT_TYPES_H 00029 #include "ut_types.h" 00030 #endif 00031 00032 #ifdef HAVE_EXPAT 00033 #ifndef gchar 00034 typedef gchar gchar; 00035 #endif 00036 #include <expat.h> 00037 #else 00038 #define XML_Encoding void 00039 #endif 00040 00041 #include "ut_bijection.h" 00042 #include "ut_iconv.h" 00043 #include "ut_xml.h" 00044 00045 struct ABI_EXPORT XAP_LangInfo 00046 { 00047 /*no memeber can have NULL value. If string is empty, then value is 00048 not defined. All fields are strings to simplify searches. 00049 */ 00050 enum fieldidx { longname_idx, /*this field is not empty*/ 00051 isoshortname_idx /*ISO*/, 00052 countrycode_idx, /*e.g. the "US" in "en-US" */ 00053 winlangcode_idx, /*0x400 + atoi() it to get a value*/ 00054 macname_idx, /*e.g. "langRussian" or empty*/ 00055 maclangcode_idx, /*atoi() it to get a value*/ 00056 max_idx = maclangcode_idx }; 00057 00058 const char* fields[max_idx+1]; 00059 00060 size_t outerQuoteIdx; 00061 size_t innerQuoteIdx; 00062 }; 00063 00064 struct ABI_EXPORT XAP_SmartQuoteStyle 00065 { 00066 UT_UCSChar leftQuote; 00067 UT_UCSChar rightQuote; 00068 }; 00069 00070 00071 class ABI_EXPORT XAP_EncodingManager 00072 { 00073 public: 00074 /* 00075 These shouldn't return NULL. Don't g_free or write to returned strings. 00076 The strings should be uppercased (extra font tarballs assume this). 00077 */ 00078 virtual const char* getNativeEncodingName() const; 00079 virtual const char* getNativeSystemEncodingName() const; 00080 virtual const char* getNative8BitEncodingName() const; 00081 virtual const char* getNativeNonUnicodeEncodingName() const; 00082 00083 /* 00084 These can return NULL. Don't g_free or write to returned strings. 00085 The strings should be uppercased (extra font tarballs assume this). 00086 */ 00087 virtual const char* getNativeUnicodeEncodingName() const; 00088 virtual const char* getUCS2BEName() const; 00089 virtual const char* getUCS2LEName() const; 00090 virtual const char* getUCS4BEName() const; 00091 virtual const char* getUCS4LEName() const; 00092 00093 /* 00094 This should return true for any Unicode locale: 00095 UTF-8 on *nix, UCS-2 on Windows, etc 00096 */ 00097 inline virtual bool isUnicodeLocale() const {return m_bIsUnicodeLocale;} 00098 00099 /* 00100 This shouldn't return NULL. Don't g_free or write to returned string. 00101 Returns ISO two-letter name like "en" 00102 */ 00103 virtual const char* getLanguageISOName() const; 00104 00105 /* 00106 This can return NULL. Don't g_free or write to returned string. 00107 Returns ISO two-letter territory name like "UK". 00108 */ 00109 00110 virtual const char* getLanguageISOTerritory() const; 00111 00112 #if 0 00113 /* 00114 for exporting to Tex - in order to provide proper argument for 00115 {inputenc}, e.g. \usepackage[koi8-r]{inputenc} 00116 If NULL is returned, then package 'inputenc' is not used at all. 00117 */ 00118 virtual const char* getNativeTexEncodingName() const; 00119 #else 00120 virtual void placeholder() {};//to be removed 00121 #endif 00122 /* 00123 Should return "\n"-terminated prologue that loads required packages, 00124 etc. 00125 */ 00126 virtual const char* getTexPrologue() const; 00127 00128 /*these return 0 if they can't convert*/ 00129 /* 00130 This won't work for c>0xff. Use UT_Mbtowc! 00131 */ 00132 virtual UT_UCSChar try_nativeToU(UT_UCSChar c) const; 00133 /* 00134 If returned value is > 0xff, then multibyte seq is returned by iconv 00135 and return value mean nothing (except that it notes that singlebyte 00136 encoding can't be used for this character. 00137 */ 00138 virtual UT_UCSChar try_UToNative(UT_UCSChar c) const; 00139 virtual UT_UCSChar try_UToLatin1(UT_UCSChar c) const; 00140 00141 /*these are used for reading/writing of doc and rtf files. */ 00142 virtual UT_UCSChar try_WindowsToU(UT_UCSChar c) const; 00143 virtual UT_UCSChar try_UToWindows(UT_UCSChar c) const; 00144 00145 00146 00147 00148 virtual char fallbackChar(UT_UCSChar c) const; 00149 static XAP_EncodingManager *get_instance(); 00150 void Delete_instance(); 00151 00152 /* This tries to approximate the character with the string, e.g. 00153 horizontal-elipsis -> "...". Returns # of chars written or 0 if can't. 00154 The returned string will be in ascii (i.e. it will be representable 00155 in any encoding). 00156 If 'max_length' is 1, then approrixmation with exactly one character is 00157 requested. If 'max_length' is not 1, then it's rather large (e.g. 16) - 00158 so there is no need to check whether there is enough g_free space in the 00159 buffer. 00160 */ 00161 virtual UT_uint32 approximate(char* out,UT_uint32 max_length,UT_UCSChar c) const; 00162 00163 /* 00164 This should return 0 if it's unknown. Used only when exporting the 00165 document. 00166 */ 00167 virtual UT_uint32 getWinLanguageCode() const; 00168 00169 /* 00170 0 means Ascii. See _CHARSET macros (e.g RUSSIAN_CHARSET) in wingdi.h 00171 from Win32 SDK. Used only when exporting RTF. 00172 */ 00173 virtual UT_uint32 getWinCharsetCode() const; 00174 00175 /*can be called several times - e.g. by constructor of port-specific 00176 implementation. */ 00177 virtual void initialize(); 00178 /* 00179 returns 1 if current langauge is CJK (chinese, japanese, korean) 00180 */ 00181 inline virtual bool cjk_locale() const { return is_cjk_; } 00182 00183 /* 00184 returns true if there is no distinction between upper and lower 00185 letters. 00186 */ 00187 virtual bool single_case() const; 00188 00189 /* 00190 returns true if all letters are non-CJK. Under non-cjk locales 00191 it returns 1. Under cjk locales, returns 1 if all chars <0xff 00192 in that range. 00193 */ 00194 virtual bool noncjk_letters(const UT_UCSChar* str,int len) const; 00195 00196 /* 00197 * Returns true if a break between c[0] and c[1] is permissible. 00198 */ 00199 virtual bool canBreakBetween(const UT_UCS4Char c[2]) const; 00200 00201 /* 00202 This should be as precise as possible. 00203 */ 00204 virtual bool is_cjk_letter(UT_UCSChar c) const; 00205 00206 /* 00207 This is rather smart wrapper for wvLIDToCodePageConverter. 00208 Not all CP* are known by current iconv's, so this function 00209 will try first to return charset string that iconv knows. 00210 */ 00211 virtual const char* charsetFromCodepage(int lid) const; 00212 00213 /* 00214 This is convert charset to codepage. 00215 */ 00216 virtual const char* CodepageFromCharset(const char *charset) const; 00217 00218 /* 00219 returns charsetFromCodepage( getWinLanguageCode() ) 00220 */ 00221 virtual const char* WindowsCharsetName() const; 00222 00223 /* these use try_ methods, and if they fail, fallbackChar() is returned*/ 00224 UT_UCSChar nativeToU(UT_UCSChar c) const; 00225 UT_UCSChar UToNative(UT_UCSChar c) const; 00226 UT_UCSChar WindowsToU(UT_UCSChar c) const; 00227 UT_UCSChar UToWindows(UT_UCSChar c) const; 00228 00229 /* 00230 this will convert the string 'in' from charset 'charset' to 00231 native charset. This is mostly shorthand function. 00232 it returns ptr to translated string - it will be either 00233 'in' or will be in static storage that shouldn't be freed. 00234 Allowed values for 'charset' include "" and NULL in which case 00235 'in' be returned. Don't ask to translate strings longer than 2K. 00236 This code is mostly used for translating localized menuitems to 00237 the proper charset. 00238 00239 Of course it uses iconv internally. 00240 */ 00241 const char* strToNative(const char* in,const char* charset,bool bReverse=false, bool bUseSysEncoding=false) const; 00242 00243 /* 00244 Same as above, but it will use buffer provided by caller. 00245 */ 00246 const char* strToNative(const char* in,const char* charset,char* buf,int bufsz, bool bReverse=false, bool bUseSysEncoding=false) const; 00247 00248 /*this is used by code that reads xml using expat*/ 00249 static int XAP_XML_UnknownEncodingHandler(void *encodingHandlerData, 00250 const gchar *name, 00251 XML_Encoding *info); 00252 00253 /*it's terminated with the record with NULL in language name. */ 00254 static const XAP_LangInfo langinfo[]; 00255 00256 /*it's terminated with a record with all NULLs. */ 00257 static const XAP_SmartQuoteStyle smartQuoteStyles[]; 00258 00259 /* 00260 Precise meaning: 00261 swap_utos: the following seq should produce a seq in buf that 00262 iconv will understand correctly when converting 00263 from UCS to mbs. 00264 unsigned short V; 00265 char buf[2]; 00266 b0 = V&0xff, b1 = V>>8; 00267 buf[swap_utos]=b0; 00268 buf[!swap_utos]=b1; 00269 swap_stou: the following seq should produce a correct value V 00270 that iconv will understand correctly when converting 00271 from mbs to UCS (i.e. return value). 00272 iconv(cd,&inptr,&inlen,&outptr,&outlen); 00273 unsigned short V; 00274 b0 = outptr_orig[swap_stou],b1 = outptr_orig[!swap_stou]; 00275 V = b0 | (b1<<8); 00276 */ 00277 static bool swap_utos; 00278 static bool swap_stou; 00279 00280 /* these are utility functions. Since all fields are strings, 00281 we can use the same routine. Returns NULL if nothing was found. */ 00282 static const XAP_LangInfo* findLangInfo(const char* key, 00283 XAP_LangInfo::fieldidx column); 00284 00285 static const XAP_LangInfo* findLangInfoByLocale(const char* locale); 00286 00287 /*word uses non-ascii names of fonts in .doc*/ 00288 static UT_Bijection cjk_word_fontname_mapping; 00289 /* CJK users need slightly different set of fontsizes*/ 00290 static UT_Bijection fontsizes_mapping; 00291 protected: 00292 void describe(); 00293 XAP_EncodingManager(); 00294 virtual ~XAP_EncodingManager(); 00295 00296 private: 00297 static XAP_EncodingManager* _instance; 00298 00299 const char* TexPrologue; 00300 UT_uint32 WinLanguageCode,WinCharsetCode; 00301 bool is_cjk_,m_bIsUnicodeLocale; 00302 }; 00303 00304 /* 00305 This one returns NULL-terminated vector of strings in static buffers (i.e. 00306 don't try to g_free anything). On next call, filled data will be lost. 00307 returns the following strings surrounded by prefix and suffix: 00308 if (!skip_fallback) 00309 ""; 00310 //next ones also include 'sep' to the left of them 00311 "%s" XAP_E..M..::instance->getLanguageISOName() 00312 "%s" XAP_E..M..::getNativeEncodingName() 00313 "%s-%s" XAP_E..M..::getLanguageISOName(),XAP_E..M..::getLanguageISOTerritory() 00314 "%s-%s.%s" XAP_E..M..::getLanguageISOName(), \ 00315 XAP_E..M..::getLanguageISOTerritory(), getNativeEncodingName() 00316 */ 00317 const char** localeinfo_combinations(const char* prefix,const char* suffix,const char* sep, bool skip_fallback=0); 00318 00319 /*these one are used by ispell*/ 00320 /* placate win32 compiler */ 00321 extern "C" { 00322 extern int XAP_EncodingManager__swap_stou,XAP_EncodingManager__swap_utos; 00323 const char * xap_encoding_manager_get_language_iso_name(void); 00324 } 00325 00326 #endif /* XAP_ENCMGR_H */