AbiWord: xap_EncodingManager.h Source File

Go to the documentation of this file.
00001 /* AbiSource Application Framework
00002  * Copyright (C) 2000
00003  * Originally by Vlad Harchev <hvv@hippo.ru>
00004  *
00005  * This program is free software; you can redistribute it and/or
00006  * modify it under the terms of the GNU General Public License
00007  * as published by the Free Software Foundation; either version 2
00008  * of the License, or (at your option) any later version.
00009  *
00010  * This program is distributed in the hope that it will be useful,
00011  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00012  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00013  * GNU General Public License for more details.
00014  *
00015  * You should have received a copy of the GNU General Public License
00016  * along with this program; if not, write to the Free Software
00017  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
00018  * 02110-1301 USA.
00019  */
00020 
00021 #ifndef XAP_ENCMGR_H
00022 #define XAP_ENCMGR_H
00023 
00024 /* pre-emptive dismissal; ut_types.h is needed by just about everything,
00025  * so even if it's commented out in-file that's still a lot of work for
00026  * the preprocessor to do...
00027  */
00028 #ifndef UT_TYPES_H
00029 #include "ut_types.h"
00030 #endif
00031 
00032 #ifdef HAVE_EXPAT /* with expat, XML_Encoding comes from <expat.h> */
00033 #ifndef gchar /* NOTE(review): #ifndef only detects a macro named gchar, not a typedef */
00034 typedef gchar gchar; /* NOTE(review): aliases gchar to itself, so gchar must already be declared - confirm the intended source type */
00035 #endif
00036 #include <expat.h>
00037 #else
00038 #define XML_Encoding void /* lets the XML_Encoding* parameter of XAP_XML_UnknownEncodingHandler be declared without expat */
00039 #endif
00040 
00041 #include "ut_bijection.h"
00042 #include "ut_iconv.h"
00043 #include "ut_xml.h"
00044 
00045 struct ABI_EXPORT XAP_LangInfo /* one language record: string fields indexed by fieldidx plus two quote indices */
00046 {
00047     /*no member can have NULL value. If string is empty, then value is
00048      not defined. All fields are strings to simplify searches.
00049     */
00050     enum fieldidx { longname_idx, /*this field is not empty*/
00051             isoshortname_idx /*ISO*/,
00052             countrycode_idx, /*e.g. the "US" in "en-US" */
00053             winlangcode_idx, /*0x400 + atoi() it to get a value*/
00054             macname_idx, /*e.g. "langRussian" or empty*/
00055             maclangcode_idx, /*atoi() it to get a value*/
00056             max_idx = maclangcode_idx }; /*max_idx aliases the last real index; it is used to size fields[]*/
00057 
00058     const char*     fields[max_idx+1]; /*one string per fieldidx value, indexed by that enum*/
00059 
00060     size_t outerQuoteIdx; /*presumably an index into XAP_EncodingManager::smartQuoteStyles - TODO confirm*/
00061     size_t innerQuoteIdx; /*presumably an index into XAP_EncodingManager::smartQuoteStyles - TODO confirm*/
00062 };
00063 
00064 struct ABI_EXPORT XAP_SmartQuoteStyle /* the pair of quote characters that make up one smart-quote style */
00065 {
00066     UT_UCSChar leftQuote;  /* left-hand quote character of the pair */
00067     UT_UCSChar rightQuote; /* right-hand quote character of the pair */
00068 };
00069 
00070 
00071 class ABI_EXPORT XAP_EncodingManager /* locale/encoding queries and character conversions; the constructor is protected, callers use get_instance() */
00072 {
00073 public:
00074     /*
00075     These shouldn't return NULL. Don't g_free or write to returned strings.
00076     The strings should be uppercased (extra font tarballs assume this).
00077     */
00078     virtual const char* getNativeEncodingName() const;
00079     virtual const char* getNativeSystemEncodingName() const;
00080     virtual const char* getNative8BitEncodingName() const;
00081     virtual const char* getNativeNonUnicodeEncodingName() const;
00082 
00083     /*
00084     These can return NULL. Don't g_free or write to returned strings.
00085     The strings should be uppercased (extra font tarballs assume this).
00086     */
00087     virtual const char* getNativeUnicodeEncodingName() const;
00088     virtual const char* getUCS2BEName() const;
00089     virtual const char* getUCS2LEName() const;
00090     virtual const char* getUCS4BEName() const;
00091     virtual const char* getUCS4LEName() const;
00092 
00093     /*
00094     This should return true for any Unicode locale:
00095     UTF-8 on *nix, UCS-2 on Windows, etc
00096     */
00097     inline virtual bool isUnicodeLocale() const {return m_bIsUnicodeLocale;}
00098 
00099     /*
00100     This shouldn't return NULL. Don't g_free or write to returned string.
00101     Returns ISO two-letter name like "en"
00102     */
00103     virtual const char* getLanguageISOName() const;
00104 
00105     /*
00106     This can return NULL. Don't g_free or write to returned string.
00107     Returns ISO two-letter territory name like "UK".
00108     */
00109 
00110     virtual const char* getLanguageISOTerritory() const;
00111 
00112 #if 0
00113     /*
00114     for exporting to Tex - in order to provide proper argument for
00115     {inputenc}, e.g. \usepackage[koi8-r]{inputenc}
00116     If NULL is returned, then package 'inputenc' is not used at all.
00117     */
00118     virtual const char* getNativeTexEncodingName() const;
00119 #else
00120     virtual void placeholder() {};//to be removed
00121 #endif
00122     /*
00123     Should return "\n"-terminated prologue that loads required packages,
00124     etc.
00125     */
00126     virtual const char* getTexPrologue() const;
00127 
00128     /*these return 0 if they can't convert*/
00129     /*
00130     This won't work for c>0xff. Use UT_Mbtowc!
00131     */
00132     virtual UT_UCSChar try_nativeToU(UT_UCSChar c) const;
00133     /*
00134     If returned value is > 0xff, then a multibyte seq is returned by iconv
00135     and the return value means nothing (except that it notes that a
00136     single-byte encoding can't be used for this character).
00137     */
00138     virtual UT_UCSChar try_UToNative(UT_UCSChar c)  const;
00139     virtual UT_UCSChar try_UToLatin1(UT_UCSChar c)  const;
00140 
00141     /*these are used for reading/writing of doc and rtf files. */
00142     virtual UT_UCSChar try_WindowsToU(UT_UCSChar c) const;
00143     virtual UT_UCSChar try_UToWindows(UT_UCSChar c)  const;
00144 
00145 
00146 
00147 
00148     virtual char fallbackChar(UT_UCSChar c) const; /* substitute returned by the non-try_ converters below when a try_ method fails */
00149     static XAP_EncodingManager *get_instance(); /* presumably returns the shared _instance - confirm in the .cpp */
00150     void Delete_instance(); /* NOTE(review): non-static, unlike get_instance() - confirm how callers are expected to invoke it */
00151 
00152     /*  This tries to approximate the character with the string, e.g.
00153     horizontal-ellipsis -> "...". Returns # of chars written or 0 if can't.
00154     The returned string will be in ascii (i.e. it will be representable
00155     in any encoding).
00156     If 'max_length' is 1, then approximation with exactly one character is
00157     requested. If 'max_length' is not 1, then it's rather large (e.g. 16) -
00158     so there is no need to check whether there is enough free space in the
00159     buffer.
00160     */
00161     virtual UT_uint32  approximate(char* out,UT_uint32 max_length,UT_UCSChar c) const;
00162 
00163     /*
00164        This should return 0 if it's unknown. Used only when exporting the
00165        document.
00166     */
00167     virtual UT_uint32  getWinLanguageCode() const;
00168 
00169        /*
00170       0 means Ascii. See _CHARSET macros (e.g RUSSIAN_CHARSET) in wingdi.h
00171       from Win32 SDK.   Used only when exporting RTF.
00172         */
00173     virtual UT_uint32  getWinCharsetCode() const;
00174 
00175     /*can be called several times - e.g. by constructor of port-specific
00176      implementation. */
00177     virtual void initialize();
00178     /*
00179         returns true if current language is CJK (chinese, japanese, korean)
00180     */
00181     inline virtual bool cjk_locale() const { return is_cjk_; }
00182 
00183     /*
00184         returns true if there is no distinction between upper and lower
00185         letters.
00186     */
00187     virtual bool single_case() const;
00188 
00189     /*
00190         returns true if all letters are non-CJK. Under non-cjk locales
00191         it returns 1. Under cjk locales, returns 1 if all chars <0xff
00192         in that range.
00193     */
00194     virtual bool noncjk_letters(const UT_UCSChar* str,int len) const;
00195 
00196     /*
00197      * Returns true if a break between c[0] and c[1] is permissible.
00198      */
00199     virtual bool canBreakBetween(const UT_UCS4Char c[2]) const;
00200 
00201     /*
00202         This should be as precise as possible.
00203     */
00204     virtual bool is_cjk_letter(UT_UCSChar c) const;
00205 
00206     /*
00207         This is a rather smart wrapper for wvLIDToCodePageConverter.
00208         Not all CP* are known by current iconv's, so this function
00209         will try first to return a charset string that iconv knows.
00210     */
00211     virtual const char* charsetFromCodepage(int lid) const;
00212 
00213     /*
00214         This converts a charset name to a codepage name.
00215     */
00216     virtual const char* CodepageFromCharset(const char *charset) const;
00217 
00218     /*
00219         returns charsetFromCodepage( getWinLanguageCode() )
00220     */
00221     virtual const char* WindowsCharsetName() const;
00222 
00223         /* these use try_ methods, and if they fail, fallbackChar() is returned*/
00224     UT_UCSChar nativeToU(UT_UCSChar c) const;
00225     UT_UCSChar UToNative(UT_UCSChar c)  const;
00226     UT_UCSChar WindowsToU(UT_UCSChar c) const;
00227     UT_UCSChar UToWindows(UT_UCSChar c)  const;
00228 
00229     /*
00230       this will convert the string 'in' from charset 'charset' to
00231       native charset. This is mostly a shorthand function.
00232       it returns ptr to translated string - it will be either
00233       'in' or will be in static storage that shouldn't be freed.
00234       Allowed values for 'charset' include "" and NULL in which case
00235       'in' is returned. Don't ask to translate strings longer than 2K.
00236           This code is mostly used for translating localized menuitems to
00237       the proper charset.
00238 
00239       Of course it uses iconv internally.
00240     */
00241     const char* strToNative(const char* in,const char* charset,bool bReverse=false, bool bUseSysEncoding=false) const;
00242 
00243     /*
00244      Same as above, but it will use buffer provided by caller.
00245     */
00246     const char* strToNative(const char* in,const char* charset,char* buf,int bufsz, bool bReverse=false, bool bUseSysEncoding=false) const;
00247 
00248     /*this is used by code that reads xml using expat*/
00249     static int XAP_XML_UnknownEncodingHandler(void *encodingHandlerData,
00250                                           const gchar *name,
00251                                           XML_Encoding *info);
00252 
00253     /*it's terminated with the record with NULL in language name. */
00254     static const XAP_LangInfo       langinfo[];
00255 
00256     /*it's terminated with a record with all NULLs. */
00257     static const XAP_SmartQuoteStyle        smartQuoteStyles[];
00258 
00259     /*
00260         Precise meaning:
00261         swap_utos: the following seq should produce a seq in buf that
00262                 iconv will understand correctly when converting
00263                 from UCS to mbs.
00264             unsigned short V;
00265             char buf[2];
00266             b0 = V&0xff, b1 = V>>8;
00267             buf[swap_utos]=b0;
00268             buf[!swap_utos]=b1;
00269         swap_stou: the following seq should produce a correct value V
00270             that iconv will understand correctly when converting
00271                 from mbs to UCS (i.e. return value).
00272             iconv(cd,&inptr,&inlen,&outptr,&outlen);
00273             unsigned short V;
00274             b0 = outptr_orig[swap_stou],b1 = outptr_orig[!swap_stou];
00275             V = b0 | (b1<<8);
00276     */
00277     static bool swap_utos;
00278     static bool swap_stou;
00279 
00280     /* these are utility functions. Since all fields are strings,
00281     we can use the same routine. Returns NULL if nothing was found. */
00282     static const XAP_LangInfo* findLangInfo(const char* key,
00283         XAP_LangInfo::fieldidx column);
00284 
00285     static const XAP_LangInfo* findLangInfoByLocale(const char* locale);
00286 
00287     /*word uses non-ascii names of fonts in .doc*/
00288     static UT_Bijection cjk_word_fontname_mapping;
00289         /* CJK users need slightly different set of fontsizes*/
00290     static UT_Bijection fontsizes_mapping;
00291 protected:
00292     void describe();
00293     XAP_EncodingManager();
00294     virtual ~XAP_EncodingManager();
00295 
00296 private:
00297     static XAP_EncodingManager*     _instance; /* presumably the object handed out by get_instance() - confirm in the .cpp */
00298 
00299     const char* TexPrologue; /* presumably backs getTexPrologue() - TODO confirm */
00300     UT_uint32 WinLanguageCode,WinCharsetCode; /* presumably back getWinLanguageCode()/getWinCharsetCode() - TODO confirm */
00301     bool is_cjk_,m_bIsUnicodeLocale; /* returned by cjk_locale() and isUnicodeLocale() respectively */
00302 };
00303 
00304 /*
00305     Returns a NULL-terminated vector of strings held in static buffers (i.e.
00306     don't try to g_free anything). The data is overwritten by the next call.
00307     Each returned string is surrounded by prefix and suffix; the strings are:
00308     if (!skip_fallback)
00309     "";
00310     //next ones also include 'sep' to the left of them
00311     "%s"    XAP_E..M..::instance->getLanguageISOName()
00312     "%s"    XAP_E..M..::getNativeEncodingName()
00313     "%s-%s" XAP_E..M..::getLanguageISOName(),XAP_E..M..::getLanguageISOTerritory()
00314     "%s-%s.%s"  XAP_E..M..::getLanguageISOName(), \
00315         XAP_E..M..::getLanguageISOTerritory(), getNativeEncodingName()
00316 */
00317 const char** localeinfo_combinations(const char* prefix,const char* suffix,const char* sep, bool skip_fallback=0);
00318 
00319 /*these are used by ispell*/
00320 /* placate win32 compiler */
00321 extern "C" {
00322 extern int XAP_EncodingManager__swap_stou,XAP_EncodingManager__swap_utos; /* presumably C-visible copies of XAP_EncodingManager::swap_stou/swap_utos - TODO confirm */
00323 const char * xap_encoding_manager_get_language_iso_name(void); /* presumably a C wrapper around getLanguageISOName() - TODO confirm */
00324 }
00325 
00326 #endif /* XAP_ENCMGR_H */