AbiWord: ut_string.h Source File

Go to the documentation of this file.
00001 /* AbiSource Program Utilities
00002  * Copyright (C) 1998,1999 AbiSource, Inc.
00003  *
00004  * This program is free software; you can redistribute it and/or
00005  * modify it under the terms of the GNU General Public License
00006  * as published by the Free Software Foundation; either version 2
00007  * of the License, or (at your option) any later version.
00008  *
00009  * This program is distributed in the hope that it will be useful,
00010  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00011  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00012  * GNU General Public License for more details.
00013  *
00014  * You should have received a copy of the GNU General Public License
00015  * along with this program; if not, write to the Free Software
00016  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
00017  * 02110-1301 USA.
00018  */
00019 
00020 
00021 #ifndef UT_STRING_H
00022 #define UT_STRING_H
00023 
00024 #include <map>
00025 #include <string>
00026 #include <string.h>
00027 
00028 /* Pre-emptive skip: ut_types.h is needed by just about everything, so
00029  * even though its own include guard would blank it out on re-inclusion,
00030  * opening and scanning it again is still work for the preprocessor.
00031  */
00032 #ifndef UT_TYPES_H
00033 #include "ut_types.h"
00034 #endif
00035 
00036 class UT_GrowBuf;
00037 
00038 G_BEGIN_DECLS
00039 
00040 // Clones szSource into newly allocated memory; the new string is handed back through rszDest.
// NOTE(review): the name suggests ampersands are removed from the copy, and the bool
// presumably reports success -- confirm against the implementation. Who is expected
// to free rszDest is not visible from this header.
00041 ABI_EXPORT bool  UT_XML_cloneNoAmpersands(gchar *& rszDest, const gchar * szSource);
00042 // Same calling shape as the function above, but replaces &X -> _X; also allocates the buffer.
00043 ABI_EXPORT bool  UT_XML_cloneConvAmpersands(gchar *& rszDest, const gchar * szSource);
00044 // This function uses a static buffer to do the translation.
// NOTE(review): presumably the returned pointer refers to that static buffer, in which
// case a later call would overwrite it -- confirm before holding on to the result.
00045 ABI_EXPORT const gchar *  UT_XML_transNoAmpersands(const gchar * szSource);
00046 
// NOTE(review): presumably decodes the UTF-8 text at p (length len) into pResult;
// the unit of len and whether pResult is appended to or replaced are not visible here.
00047 ABI_EXPORT void  UT_decodeUTF8string(const gchar * p, UT_uint32 len, UT_GrowBuf * pResult);
00048 
// XML-validity helpers. From the signatures alone: UT_isValidXML takes a const string
// and so cannot modify it, while UT_ensureValidXML and UT_validXML take mutable
// arguments. NOTE(review): presumably the latter two repair the text in place and the
// bool reports whether anything had to change -- confirm against the implementation.
00051 ABI_EXPORT bool  UT_ensureValidXML(std::string & s);
00052 ABI_EXPORT bool  UT_isValidXML(const char *s);
00053 ABI_EXPORT bool  UT_validXML(char * s);
00054 
00055 /* ABI_EXPORT gchar *  UT_decodeXMLstring(gchar *pcIn);
00056  * This has moved to ut_xml.cpp as UT_XML::decode ()
00057  */
00058 
// Single-character predicates related to smart-quote handling.
// NOTE(review): going by the names, "quotable" characters are candidates for being
// turned into a smart quote and "quoted" ones already are smart quotes -- confirm.
00059 ABI_EXPORT bool  UT_isSmartQuotableCharacter(UT_UCSChar c);
00060 ABI_EXPORT bool  UT_isSmartQuotedCharacter(UT_UCSChar c);
00061 
00063 //
00064 //  UCS-2 string (UT_UCS2Char)
00065 //
00066 //  String is built of 16-bit units (words)
00067 //
00068 //  TODO: Is this really UCS-2 or UTF-16?
00069 //  TODO:  meaning, does it support surrogates or is it intended to
00070 //  TODO:  support them at any time in the future?
00071 //  TODO: Correctly, UCS-2 does not support surrogates and UTF-16 does.
00072 //  TODO: BUT Microsoft calls their native Unicode encoding UCS-2
00073 //  TODO:  while it supports surrogates and is thus really UTF-16.
00074 //  TODO: Surrogate pairs encode Unicode characters with codepoints
00075 //  TODO:  above 65535, which cannot fit into a single 2-byte word.
00076 //  TODO: This means that TRUE UCS-2 is a single-word encoding and
00077 //  TODO:  UTF-16 is a multi-word encoding.
00078 //
00079 //  NOTE: We shouldn't actually need 16-bit strings anymore since
00080 //  NOTE:  AbiWord is now fully converted to using 32-bit Unicode
00081 //  NOTE:  internally. The only remaining uses for this would be
00082 //  NOTE:  Windows GUI, filesystem and API functions where applicable,
00083 //  NOTE:  and perhaps some file formats or external libraries.
00084 //
00086 
// The 16-bit string API in this section is only declared when ENABLE_UCS2_STRINGS is defined.
00087 #ifdef ENABLE_UCS2_STRINGS
00088 
// ASCII-only digit test ('0'..'9'). This is a macro and x is evaluated twice, so do
// not pass an argument with side effects.
00089 #define UT_UCS2_isdigit(x)  (((x) >= '0') && ((x) <= '9'))  // TODO: make UNICODE-wise
00090 
00091 /* the function predicates below are unicode-safe; the macros mixed in are flagged individually */
00092 ABI_EXPORT bool  UT_UCS2_isupper(UT_UCS2Char c);
00093 ABI_EXPORT bool  UT_UCS2_islower(UT_UCS2Char c);
00094 ABI_EXPORT bool  UT_UCS2_isalpha(UT_UCS2Char c);
00095 ABI_EXPORT bool  UT_UCS2_isSentenceSeparator(UT_UCS2Char c);
// Macro: x appears more than once in the expansion (it also expands UT_UCS2_isdigit),
// so do not pass an argument with side effects.
00096 #define UT_UCS2_isalnum(x)  (UT_UCS2_isalpha(x) || UT_UCS2_isdigit(x)) // HACK: not UNICODE-safe
00097 ABI_EXPORT bool UT_UCS2_isspace(UT_UCS2Char c);
// Macro: true when x is neither space nor alphanumeric and is greater than ' '.
// x appears several times in the expansion; do not pass an argument with side effects.
00098 #define UT_UCS2_ispunct(x)   ((!UT_UCS2_isspace(x)  &&  !UT_UCS2_isalnum(x)  &&  (x)>' '))  // HACK: not UNICODE safe
00099 
00100 // the naming convention has deviated from the above.  it's kind
00101 // of a mutant libc/C++ naming convention.
// NOTE(review): the names mirror libc's strstr/strcmp/strcpy/strncpy, so the semantics
// presumably match those functions applied to 0-terminated UT_UCS2Char strings; the
// *_char variants take or produce plain char strings. The cloneString variants write
// through dest and return bool -- allocation and ownership are not visible here.
00102 ABI_EXPORT UT_UCS2Char *     UT_UCS2_strstr(const UT_UCS2Char * phaystack, const UT_UCS2Char * pneedle);
00103 ABI_EXPORT UT_sint32         UT_UCS2_strcmp(const UT_UCS2Char* left, const UT_UCS2Char* right);
00104 ABI_EXPORT UT_UCS2Char *     UT_UCS2_stristr(const UT_UCS2Char * phaystack, const UT_UCS2Char * pneedle);
00105 ABI_EXPORT UT_UCS2Char *     UT_UCS2_strcpy(UT_UCS2Char * dest, const UT_UCS2Char * src);
00106 ABI_EXPORT UT_UCS2Char *     UT_UCS2_strcpy_char(UT_UCS2Char * dest, const char * src);
00107 ABI_EXPORT char *            UT_UCS2_strcpy_to_char(char * dest, const UT_UCS2Char * src);
00108 ABI_EXPORT bool          UT_UCS2_cloneString(UT_UCS2Char ** dest, const UT_UCS2Char * src);
00109 ABI_EXPORT bool          UT_UCS2_cloneString_char(UT_UCS2Char ** dest, const char * src);
00110 ABI_EXPORT UT_UCS2Char *     UT_UCS2_strncpy(UT_UCS2Char * dest, const UT_UCS2Char * src, UT_uint32 n);
00111 ABI_EXPORT UT_UCS2Char *     UT_UCS2_strnrev(UT_UCS2Char * dest, UT_uint32 n);
00112 
// Single-character case conversion.
00113 ABI_EXPORT UT_UCS2Char       UT_UCS2_tolower(UT_UCS2Char c);
00114 ABI_EXPORT UT_UCS2Char       UT_UCS2_toupper(UT_UCS2Char c);
00115 
00116 #endif /* ENABLE_UCS2_STRINGS */
00117 
00118 // Don't ifdef this one out (it is deliberately declared unconditionally) since the MSWord importer uses it
00119 
// NOTE(review): presumably returns the number of 16-bit units before the terminating 0 -- confirm.
00120 ABI_EXPORT UT_uint32         UT_UCS2_strlen(const UT_UCS2Char * string);
00121 
00123 //
00124 //  UCS-4 string (UT_UCS4Char)
00125 //
00126 //  String is built of 32-bit units (longs)
00127 //
00128 //  NOTE: Ambiguity between UCS-2 and UTF-16 above makes no difference
00129 //  NOTE:  in the case of UCS-4 and UTF-32 since they really are
00130 //  NOTE:  identical
00131 //
00133 
00134 /* the function predicates below are unicode-safe; the macros mixed in are flagged individually */
00135 ABI_EXPORT bool  UT_UCS4_isupper(UT_UCS4Char c);
00136 ABI_EXPORT bool  UT_UCS4_islower(UT_UCS4Char c);
00137 ABI_EXPORT bool  UT_UCS4_isalpha(UT_UCS4Char c);
00138 ABI_EXPORT bool  UT_UCS4_isSentenceSeparator(UT_UCS4Char c);
00139 ABI_EXPORT bool  UT_UCS4_isdigit(UT_UCS4Char c);
// Macro: x is passed to both UT_UCS4_isalpha and UT_UCS4_isdigit, so it can be
// evaluated twice; do not pass an argument with side effects.
00140 #define UT_UCS4_isalnum(x)  (UT_UCS4_isalpha(x) || UT_UCS4_isdigit(x)) // HACK: not UNICODE-safe
00141 ABI_EXPORT bool UT_UCS4_isspace(UT_UCS4Char c);
// Macro: true when x is neither space nor alphanumeric and is greater than ' '.
// x appears several times in the expansion; do not pass an argument with side effects.
00142 #define UT_UCS4_ispunct(x)   ((!UT_UCS4_isspace(x)  &&  !UT_UCS4_isalnum(x)  &&  (x)>' '))  // HACK: not UNICODE safe
00143 
00144 // the naming convention has deviated from the above.  it's kind
00145 // of a mutant libc/C++ naming convention.
// NOTE(review): the names mirror libc's strcmp/strstr/strlen/strcpy/strncpy, so the
// semantics presumably match those functions applied to 0-terminated UT_UCS4Char
// strings; the *_char variants take or produce plain char strings and the *_utf8_char
// variant presumably reads UTF-8. The trailing int of UT_UCS4_strncpy_char and
// UT_UCS4_strncpy_to_char is unnamed in this header -- presumably a maximum count.
// The cloneString variants write through dest and return bool; allocation and
// ownership are not visible here. Confirm all of the above against the implementation.
00146 ABI_EXPORT UT_sint32         UT_UCS4_strcmp(const UT_UCS4Char* left, const UT_UCS4Char* right);
00147 ABI_EXPORT UT_UCS4Char *     UT_UCS4_strstr(const UT_UCS4Char * phaystack, const UT_UCS4Char * pneedle);
00148 ABI_EXPORT UT_UCS4Char *     UT_UCS4_stristr(const UT_UCS4Char * phaystack, const UT_UCS4Char * pneedle);
00149 ABI_EXPORT UT_uint32         UT_UCS4_strlen(const UT_UCS4Char * string);
00150 ABI_EXPORT UT_uint32         UT_UCS4_strlen_as_char(const UT_UCS4Char * string);
00151 ABI_EXPORT UT_UCS4Char *     UT_UCS4_strcpy(UT_UCS4Char * dest, const UT_UCS4Char * src);
00152 ABI_EXPORT UT_UCS4Char *     UT_UCS4_strcpy_char(UT_UCS4Char * dest, const char * src);
00153 ABI_EXPORT UT_UCS4Char *     UT_UCS4_strncpy_char(UT_UCS4Char * dest, const char * src, int);
00154 ABI_EXPORT UT_UCS4Char *     UT_UCS4_strcpy_utf8_char(UT_UCS4Char * dest, const char * src);
00155 ABI_EXPORT char *            UT_UCS4_strcpy_to_char(char * dest, const UT_UCS4Char * src);
00156 ABI_EXPORT char *            UT_UCS4_strncpy_to_char(char * dest, const UT_UCS4Char * src, int);
00157 ABI_EXPORT bool              UT_UCS4_cloneString(UT_UCS4Char ** dest, const UT_UCS4Char * src);
00158 ABI_EXPORT bool              UT_UCS4_cloneString_char(UT_UCS4Char ** dest, const char * src);
00159 ABI_EXPORT UT_UCS4Char *     UT_UCS4_strncpy(UT_UCS4Char * dest, const UT_UCS4Char * src, UT_uint32 n);
00160 ABI_EXPORT UT_UCS4Char *     UT_UCS4_strnrev(UT_UCS4Char * dest, UT_uint32 n);
00161 
// Single-character case conversion.
00162 ABI_EXPORT UT_UCS4Char       UT_UCS4_tolower(UT_UCS4Char c);
00163 ABI_EXPORT UT_UCS4Char       UT_UCS4_toupper(UT_UCS4Char c);
00164 
00165 
// Both functions take a C string and a caller-supplied std::map<string, string> by
// reference and return nothing, so any result is delivered through the map.
// NOTE(review): presumably the input is split into name/value pairs that are stored
// in the map; the accepted syntax of each (attributes vs. properties) and whether
// existing map entries are kept is not visible from this header -- confirm.
00166 ABI_EXPORT void UT_parse_attributes(const char * attributes,
00167                                     std::map<std::string, std::string> & map);
00168 ABI_EXPORT void UT_parse_properties(const char * props,
00169                                     std::map<std::string, std::string> & map);
00170 
00171 // UT_strptime: portable spelling of strptime(). It is implemented in UT_strptime.cpp
// for Windows, where strptime() is not available; there it is declared as a function
// with C linkage. On every other platform the name is just a macro alias for the
// system strptime().
// NOTE(review): no <time.h> include is visible in this header; callers presumably
// need it for struct tm and, off Windows, for strptime() itself -- confirm whether
// ut_types.h already pulls it in.
00172 #ifdef _WIN32
00173 
00174 extern "C" {
00175 ABI_EXPORT char *UT_strptime (const char *buf, const char *format, struct tm *tm);
00176 }
00177 
00178 #else
00179 
00180 #define UT_strptime strptime
00181 
00182 #endif
00183 
00184 
// Windows-only compatibility shims.
00185 #ifdef _WIN32
// Routes every use of snprintf to _snprintf.
// NOTE(review): per Microsoft's CRT documentation _snprintf does not guarantee NUL
// termination when the output is truncated, and newer MSVC runtimes ship their own
// snprintf -- confirm this mapping is still wanted for the supported toolchains.
00186 #define snprintf _snprintf
00187 
// Identity macros: the string argument is passed through unchanged.
// NOTE(review): presumably stand-ins for the gettext-style _() / N_() markers -- confirm.
00188 #define _(String) (String)
00189 #define N_(String) (String)
00190 
00191 #endif /* _WIN32 */
00192 
// When the build defines SNPRINTF_MISSING, declare snprintf here.
// NOTE(review): the definition presumably lives elsewhere in the project -- confirm.
00193 #if defined (SNPRINTF_MISSING)
00194   extern int snprintf(char *str, size_t size, const  char  *format, ...);
00195 #endif
00196 
00197 /*
00198  Produces a string for a floating point value, using a dot as the fractional
00199  separator independent of the current locale's settings.
 NOTE(review): the lifetime of the returned string is not visible from this
 header (possibly a static buffer) -- confirm before keeping the pointer.
00200 */
00201 ABI_EXPORT const char*  std_size_string(float f);
00202 
00203 
00204 #include <fribidi.h>
00205 
// Bidirectional-text support is layered directly on FriBidi: the character-type
// typedef and the UT_BIDI_* names below are aliases for their FriBidi counterparts.
00206 typedef FriBidiCharType UT_BidiCharType;
00207 
// Character-type constants, one per FRIBIDI_TYPE_* value of the same suffix.
00208 #define UT_BIDI_LTR FRIBIDI_TYPE_LTR
00209 #define UT_BIDI_RTL FRIBIDI_TYPE_RTL
00210 #define UT_BIDI_WS FRIBIDI_TYPE_WS
00211 #define UT_BIDI_EN FRIBIDI_TYPE_EN
00212 #define UT_BIDI_ES FRIBIDI_TYPE_ES
00213 #define UT_BIDI_ET FRIBIDI_TYPE_ET
00214 #define UT_BIDI_AN FRIBIDI_TYPE_AN
00215 #define UT_BIDI_CS FRIBIDI_TYPE_CS
00216 #define UT_BIDI_BS FRIBIDI_TYPE_BS
00217 #define UT_BIDI_SS FRIBIDI_TYPE_SS
00218 #define UT_BIDI_AL FRIBIDI_TYPE_AL
00219 #define UT_BIDI_NSM FRIBIDI_TYPE_NSM
00220 #define UT_BIDI_RLE FRIBIDI_TYPE_RLE
00221 #define UT_BIDI_LRE FRIBIDI_TYPE_LRE
00222 #define UT_BIDI_LRO FRIBIDI_TYPE_LRO
00223 #define UT_BIDI_RLO FRIBIDI_TYPE_RLO
00224 #define UT_BIDI_PDF FRIBIDI_TYPE_PDF
00225 #define UT_BIDI_ON FRIBIDI_TYPE_ON
00226 
00227 
// Aliases for FRIBIDI_TYPE_UNSET / FRIBIDI_TYPE_IGNORE.
// NOTE(review): presumably sentinel values rather than real character classes -- confirm in FriBidi.
00228 #define UT_BIDI_UNSET FRIBIDI_TYPE_UNSET
00229 #define UT_BIDI_IGNORE FRIBIDI_TYPE_IGNORE
00230 
// Classification macros; each simply forwards to the FriBidi macro of the same suffix.
00231 #define UT_BIDI_IS_STRONG FRIBIDI_IS_STRONG
00232 #define UT_BIDI_IS_WEAK FRIBIDI_IS_WEAK
00233 #define UT_BIDI_IS_NUMBER FRIBIDI_IS_NUMBER
00234 #define UT_BIDI_IS_RTL FRIBIDI_IS_RTL
00235 #define UT_BIDI_IS_NEUTRAL FRIBIDI_IS_NEUTRAL
00236 #define UT_BIDI_IS_LETTER FRIBIDI_IS_LETTER
// Unlike the forwarding macros above, this one is spelled out here: it masks x with
// FRIBIDI_MASK_NSM, so the result is the masked bits (non-zero means set), not 0/1.
00237 #define UT_BIDI_IS_NSM(x) ((x) & FRIBIDI_MASK_NSM)
00238 
00239 
// Returns the bidi character type (a UT_BidiCharType, i.e. FriBidiCharType) of c.
00240 ABI_EXPORT UT_BidiCharType UT_bidiGetCharType(UT_UCS4Char c);
00241 
// NOTE(review): presumably computes logical-to-visual (pL2V) and visual-to-logical
// (pV2L) position maps plus per-character embedding levels (pEmbed) for the len
// characters at pStrIn with base direction baseDir; required array sizes, whether
// any output pointer may be NULL, and the meaning of the bool are not visible here.
00242 ABI_EXPORT bool            UT_bidiMapLog2Vis(const UT_UCS4Char * pStrIn, UT_uint32 len, UT_BidiCharType baseDir,
00243                                              UT_uint32 *pL2V, UT_uint32 * pV2L, UT_Byte * pEmbed);
00244 
// NOTE(review): presumably writes the visually reordered form of the len characters
// at pStrIn to pStrOut; the required capacity of pStrOut is not visible here -- confirm.
00245 ABI_EXPORT bool            UT_bidiReorderString(const UT_UCS4Char * pStrIn, UT_uint32 len, UT_BidiCharType baseDir,
00246                                                 UT_UCS4Char * pStrOut);
00247 
00248 
// Takes c and an output reference mc.
// NOTE(review): presumably stores the mirrored form of c in mc and returns whether
// a mirror character exists; what mc holds on a false return is not visible here.
00249 ABI_EXPORT bool            UT_bidiGetMirrorChar(UT_UCS4Char c, UT_UCS4Char &mc);
00250 
00251 G_END_DECLS
00252 
00253 #endif /* UT_STRING_H */