• Main Page
  • Related Pages
  • Namespaces
  • Classes
  • Files
  • File List
  • File Members

ut_string_class.h

Go to the documentation of this file.
00001 // ut_string_class.h
00002 //
00003 // A simple string class for use where templates are not
00004 // allowed.
00005 //
00006 #ifndef UT_STRING_CLASS_H
00007 #define UT_STRING_CLASS_H
00008 
00009 //
00010 // Copyright (C) 2001 Mike Nordell <tamlin@algonet.se>
00011 // Copyright (C) 2001 Dom Lachowicz <dominicl@seas.upenn.edu>
00012 // Copyright (C) 2002 Tomas Frydrych <tomas@frydrych.uklinux.net>
00013 //
00014 // This class is free software; you can redistribute it and/or
00015 // modify it under the terms of the GNU General Public License
00016 // as published by the Free Software Foundation; either version 2
00017 // of the License, or (at your option) any later version.
00018 //
00019 // This class is distributed in the hope that it will be useful,
00020 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00021 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00022 // GNU General Public License for more details.
00023 //
00024 // You should have received a copy of the GNU General Public License
00025 // along with this program; if not, write to the Free Software
00026 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
00027 // 02110-1301 USA.
00028 //
00029 
00030 #include <stdlib.h>
00031 #include <stdarg.h>
00032 
00033 #if defined(__MINGW32__)
00034 #  undef snprintf
00035 #  if __GNUC__ <= 3
00036 #    define _GLIBCXX_USE_C99_DYNAMIC 1
00037 #  endif
00038 #endif
00039 
00040 #include <string>
00041 
00042 /* pre-emptive dismissal; ut_types.h is needed by just about everything,
00043  * so even if it's commented out in-file that's still a lot of work for
00044  * the preprocessor to do...
00045  */
00046 #ifndef UT_TYPES_H
00047 #include "ut_types.h"
00048 #endif
00049 #include "ut_string.h"
00050 #include "ut_stringbuf.h"
00051 #include "ut_bytebuf.h"
00052 
00053 // Forward declarations
      // Incomplete types are enough at this point: UT_UCS4_mbtowc is only taken
      // by reference (UT_UTF8String::appendBuf), and UT_UCS4String is named in
      // UT_UTF8String's declarations before its own class definition appears.
00054 class UT_UCS4_mbtowc;
00055 class UT_String;
00056 class UT_UTF8String;
00057 class UT_UCS4String;
00058 
00059 
00060 // yes, this is screaming for a template
00061 
00063 //
00064 //  8-bit string
00065 //
00066 //  String is built of 8-bit units (bytes)
00067 //  Encoding could be any single-byte or multi-byte encoding
00068 //
00070 
00072 //  UT_String, a simple wrapper for zero terminated 'char' strings.
00073 //
      //  The only data member visible in this header is the private `pimpl`
      //  pointer to a UT_StringImpl<char>. Only declarations are shown here, so
      //  remarks marked NOTE(review) are assumptions to confirm against the
      //  implementation file.
00074 class ABI_EXPORT UT_String
00075 {
00076 public:
          // Default constructor. NOTE(review): presumably creates an empty string.
00077     UT_String();
          // Construct from a char buffer. With n == 0 (the default) sz is treated
          // as zero-terminated. NOTE(review): a non-zero n presumably gives the
          // number of chars to take from sz -- confirm.
          // Not `explicit`, so a `const char*` converts implicitly to UT_String.
00078     UT_String(const char* sz, size_t n = 0 /* 0 == zero-terminate */);
          // Copy constructor.
00079     UT_String(const UT_String& rhs);
          // Converting constructor from std::string (also not `explicit`).
00080     UT_String(const std::basic_string<char> &s);
00081     ~UT_String();
00082 
          // size() and length() are synonyms: length() simply forwards to size().
00083     size_t      size() const;
00084     size_t length () const { return size () ; }
          // NOTE(review): presumably a capacity request for at least n chars.
00085     void            reserve(size_t n);
00086     bool        empty() const;
          // NOTE(review): declared `const` even though the name suggests it
          // modifies the string; this is part of the exported interface, so it
          // is documented here rather than changed.
00087     void        clear() const;
00088 
          // Returns the requested range as a new UT_String (by value).
          // NOTE(review): handling of out-of-range iStart/nChars is not visible here.
00089     UT_String   substr(size_t iStart, size_t nChars) const;
00090 
          // Assignment from another UT_String, a C string, or a std::string.
00091     UT_String&  operator=(const UT_String& rhs);
00092     UT_String&  operator=(const char*      rhs);
00093     UT_String&  operator=(const std::basic_string<char> & rhs);
          // Append a UT_String, a C string, or a single char.
00094     UT_String&  operator+=(const UT_String& rhs);
00095     UT_String&  operator+=(const char*      rhs);
00096     UT_String&  operator+=(char rhs);
00097 
          // Element access: the const overload returns the char by value, the
          // non-const overload returns a reference that can be written through.
          // NOTE(review): no bounds-checking contract is visible in this header.
00098     char        operator[](size_t iPos) const;
00099     char&       operator[](size_t iPos);
00100 
          // Exchanges state with rhs.
00101     void        swap(UT_String& rhs);
00102 
00103     // The returned pointer is valid until the next non-const
00104     // operation. You will _always_ get a legal pointer back,
00105     // even if to an empty string.
00106     const char* c_str() const;
00107 
00108 private:
          // Opaque implementation object; UT_StringImpl is not defined in this header.
00109     class UT_StringImpl<char>* pimpl;
00110 };
00111 
00112 // helpers
      // Equality / inequality between UT_String values and C strings, with
      // overloads for either operand order.
00113 ABI_EXPORT bool operator==(const UT_String& s1, const UT_String& s2);
00114 ABI_EXPORT bool operator==(const UT_String& s1, const char*      s2);
00115 ABI_EXPORT bool operator==(const char*      s1, const UT_String& s2);
00116 ABI_EXPORT bool operator!=(const UT_String& s1, const UT_String& s2);
00117 ABI_EXPORT bool operator!=(const UT_String& s1, const char*      s2);
00118 ABI_EXPORT bool operator!=(const char*      s1, const UT_String& s2);
00119 
      // 32-bit hash of a string, for a UT_String or a C string.
      // NOTE(review): the hash algorithm, and whether both overloads agree for
      // equal contents, is not visible in this header.
00120 ABI_EXPORT UT_uint32 hashcode(const UT_String& string);
00121 ABI_EXPORT UT_uint32 hashcode(const char *s);
00122 
00123 // strcmp ordering
00124 ABI_EXPORT bool operator<(const UT_String& s1, const UT_String& s2);
00125 
      // Concatenation; the result is returned by value.
00126 ABI_EXPORT UT_String operator+(const UT_String& s1, const UT_String& s2);
00127 
      // Search st for the character ch.
      // NOTE(review): judging by the names, findCh searches forwards and findRCh
      // backwards; the value returned when ch is absent is not visible here --
      // check the implementation before relying on a sentinel.
00128 ABI_EXPORT size_t UT_String_findCh(const UT_String &st, char ch);
00129 ABI_EXPORT size_t UT_String_findRCh(const UT_String &st, char ch);
00130 
00131 /****************************************************************************/
00132 
      // printf-style formatting for UT_String.
      //
      // First family: takes the destination string `inStr` by non-const reference
      // and returns a UT_String&. NOTE(review): presumably the returned reference
      // is inStr itself.
      // The `const char*` overloads carry ABI_PRINTF_FORMAT(format-index, first-
      // vararg-index); the va_list forms pass 0 as the second index. The overload
      // taking the format as a UT_String has no such annotation.
      // NOTE(review): ABI_PRINTF_FORMAT is defined elsewhere; presumably it expands
      // to the compiler's printf format-checking attribute.
00137 ABI_EXPORT UT_String& UT_String_sprintf(UT_String & inStr, const char * inFormat, ...) ABI_PRINTF_FORMAT(2,3);
00138 ABI_EXPORT UT_String& UT_String_vprintf (UT_String & inStr, const char *format,
00139                                          va_list      args1)
00140     ABI_PRINTF_FORMAT(2,0);
00141 ABI_EXPORT UT_String& UT_String_vprintf (UT_String & inStr, const UT_String & format,
00142                      va_list      args1);
00143 
      // Second family: same formatting entry points, but the result is returned
      // as a new UT_String by value instead of being written into a caller-
      // supplied string.
00148 ABI_EXPORT UT_String UT_String_sprintf(const char * inFormat, ...)
00149     ABI_PRINTF_FORMAT(1,2);
00150 ABI_EXPORT UT_String UT_String_vprintf(const char * inFormat, va_list args1)
00151     ABI_PRINTF_FORMAT(1,0);
00152 ABI_EXPORT UT_String UT_String_vprintf(const UT_String & inFormat, va_list args1);
00153 
00154 /***************************************************************************/
00155 
00156 /***************************************************************************/
      // Helpers that treat a UT_String as a "property string".
      // NOTE(review): the textual format of a property string is not shown in
      // this header -- see the implementation for the exact syntax.
      //
      // getPropVal takes both arguments by const reference and returns the value
      // for sProp as a new UT_String. NOTE(review): the result for a missing
      // property is not visible here.
      // removeProperty / setProperty / addPropertyString take sPropertyString by
      // non-const reference, i.e. they update the caller's string in place.
00161 ABI_EXPORT UT_String UT_String_getPropVal(const UT_String & sPropertyString, const UT_String & sProp);
00162 ABI_EXPORT void UT_String_removeProperty(UT_String & sPropertyString, const UT_String & sProp);
00163 ABI_EXPORT void UT_String_setProperty(UT_String & sPropertyString, const UT_String &sProp, const UT_String & sVal);
00164 ABI_EXPORT void UT_String_addPropertyString(UT_String & sPropertyString, const UT_String & sNewProp);
00165 
00167 //
00168 //  UTF-8 string: encoding is *always* UTF-8
00169 //
00171 
00173 //  UT_UTF8String, a simple wrapper for zero terminated 'UTF-8' strings.
00174 //
      //  The only data member visible in this header is the private `pimpl`
      //  pointer to a UT_UTF8Stringbuf. Only declarations are shown here, so
      //  remarks marked NOTE(review) are assumptions to confirm against the
      //  implementation file.
00175 
00176 class ABI_EXPORT UT_UTF8String
00177 {
00178 public:
00179     UT_UTF8String ();
          // Construct from a char buffer; n == 0 (the default) means sz is
          // null-terminated. NOTE(review): presumably sz must already be UTF-8.
00180     UT_UTF8String (const char * sz, size_t n = 0 /* 0 == null-termination */);
          // Construct from a C string plus an encoding name.
          // NOTE(review): presumably converts sz from `encoding` to UTF-8 -- confirm.
00181     UT_UTF8String (const char *sz, const char *encoding);
00182 
          // Copy constructor, and converting constructors from UCS-4 data.
          // None of these are `explicit`.
00183     UT_UTF8String (const UT_UTF8String & rhs);
00184     UT_UTF8String (const UT_UCS4String & rhs);
00185     UT_UTF8String (const UT_UCSChar * sz, size_t n = 0 /* 0 == zero-terminate */);
00186 
00187     ~UT_UTF8String ();
00188 
          // size() and length() are synonyms: length() simply forwards to size().
          // NOTE(review): byteLength() is declared separately below, which suggests
          // size() counts characters rather than bytes -- confirm.
00189     size_t      size () const;
00190     size_t length () const { return size () ; }
00191 
00192     void            reserve(size_t n);
00193     bool        empty () const;
          // NOTE(review): declared `const` even though the name suggests it
          // modifies the string; part of the exported interface, left unchanged.
00194     void        clear () const;
00195     size_t      byteLength() const;
          // NOTE(review): presumably a debugging aid; the output destination is
          // not visible in this header.
00196     void        dump(void) const;
          // Returns the requested range as a new UT_UTF8String (by value).
00197     UT_UTF8String   substr(size_t iStart, size_t nChars) const;
00198 
          // Assignment from a C string, std::string, UTF-8 string or UCS-4 string.
00199     UT_UTF8String & operator=(const char *          rhs);
00200     UT_UTF8String & operator=(const std::string &   rhs);
00201     UT_UTF8String & operator=(const UT_UTF8String & rhs);
00202     UT_UTF8String & operator=(const UT_UCS4String & rhs);
00203 
          // Append a single UCS-4 character, a C string, std::string, UTF-8 string
          // or UCS-4 string.
00204     UT_UTF8String & operator+=(const UT_UCS4Char     rhs);
00205     UT_UTF8String & operator+=(const char *          rhs);
00206     UT_UTF8String & operator+=(const std::string &   rhs);
00207     UT_UTF8String & operator+=(const UT_UTF8String & rhs);
00208     UT_UTF8String & operator+=(const UT_UCS4String & rhs);
00209 
00210     // The returned pointer is valid until the next non-const
00211     // operation. You will _always_ get a legal pointer back,
00212     // even if to an empty (0) string.
00213     const char * utf8_str () const;
          // Returns the contents as a UT_UCS4String by value. Note that this
          // member is not const-qualified, unlike utf8_str().
00214     UT_UCS4String ucs4_str ();
00215 
          // assign / append take a char buffer; n == 0 means null-terminated.
00216     void        assign (const char * sz, size_t n = 0 /* 0 == null-termination */);
00217     void        append (const char * sz, size_t n = 0 /* 0 == null-termination */);
          // Appends the contents of a byte buffer using the supplied converter.
          // NOTE(review): presumably `converter` decodes the buffer's bytes to
          // UCS-4 before they are appended -- confirm.
00218     void        appendBuf(const UT_ConstByteBufPtr & buf, UT_UCS4_mbtowc & converter);
00219 
          // Append UCS-4 / UCS-2 data; n == 0 means the input is null-terminated.
00220     void        appendUCS4 (const UT_UCS4Char * sz, size_t n = 0 /* 0 == null-termination */);
00221     void        appendUCS2 (const UT_UCS2Char * sz, size_t n = 0 /* 0 == null-termination */);
00222 
          // The escape/decode members below are non-const, act on the current
          // string (per their comments) and return a const UT_UTF8String&.
          // NOTE(review): presumably the returned reference is *this.
00223     const UT_UTF8String & escape (const UT_UTF8String & str1,
00224                       const UT_UTF8String & str2);  // replaces <str1> with <str2> in the current string
00225     const UT_UTF8String & escapeXML ();  // escapes '<', '>', '"', & '&' in the current string
00226     const UT_UTF8String & decodeXML ();  // unescapes '<', '>', '"', & '&' in the current string
00227     const UT_UTF8String & escapeMIME (); // translates the current string to MIME "quoted-printable" format
00228     const UT_UTF8String & escapeURL ();  // make URL conform to RFC 1738
          // NOTE(review): presumably the inverse of escapeURL().
00229     const UT_UTF8String & decodeURL ();
00230 
00231     /* UTF8String - NOTES
00232      *
00233      * TODO:
00234      * 1. Maybe have a search&replace function, something like:
00235      *
00236      *  int replace (const char * utf_newstr, const char * utf_oldstr);
00237      *
00238      *    which could be used to do substitutions, e.g.:
00239      *
00240      *  UTF8String xmlstr = "expr: if ((c > 0) && (c < 0x80)) return c;";
00241      *  xmlstr.replace ("&lt;", "<");
00242      *  xmlstr.replace ("&gt;", ">");
00243      *  xmlstr.replace ("&amp;","&");
00244      *
00245      *  MIQ: Note that for these replace methods, one might use ut_std_string/replace_all()
00246      *
00247      *
00248      * getIterator:
00249      * returns a home-made iterator associated with the UTF-8 string, e.g.:
00250      *
00251      *  UTF8String str = "This is a UTF-8 string.";
00252      *  UT_UTF8Stringbuf::UTF8Iterator & iter = str.getIterator ();
00253      *  iter = iter.start (); // iter.start() returns 0 if no string, so:
00254      *  if (iter.current ())
00255      *  {
00256      *      while (true)
00257      *      {
00258      *          char * pUTF = iter.current ();
00259      *          if (*pUTF == 0) break; // end-of-string
00260      *          // etc.
00261      *          iter.advance (); // or ++iter;
00262      *      }
00263      *  }
00264      *
00265      * The iterator will be well behaved provided the string is not being edited.
00266      */
          // Builds a UTF8Iterator directly from the internal buffer pointer and
          // returns it by value.
00267     UT_UTF8Stringbuf::UTF8Iterator getIterator () const
00268     {
00269         return UT_UTF8Stringbuf::UTF8Iterator(pimpl);
00270     }
00271 
00272 private:
          // Implementation buffer; UT_UTF8Stringbuf is declared outside this header.
00273     class UT_UTF8Stringbuf * pimpl;
00274 };
00275 
00276 ABI_EXPORT bool operator<(const UT_UTF8String& s1, const UT_UTF8String& s2); // ordering; NOTE(review): comparison criterion not visible in this header
      // Equality / inequality against another UT_UTF8String, a C string, or a
      // std::string (the std::string forms exist for either operand order).
00277 ABI_EXPORT bool operator==(const UT_UTF8String& s1, const UT_UTF8String& s2);
00278 ABI_EXPORT bool operator!=(const UT_UTF8String& s1, const UT_UTF8String& s2);
00279 ABI_EXPORT bool operator==(const UT_UTF8String& s1, const char * s2);
00280 ABI_EXPORT bool operator!=(const UT_UTF8String& s1, const char * s2);
00281 ABI_EXPORT bool operator==(const UT_UTF8String& s1, const std::string & s2);
00282 ABI_EXPORT bool operator!=(const UT_UTF8String& s1, const std::string & s2);
      // In the two overloads below the parameter names are swapped relative to
      // their position: the first (std::string) parameter is called s2 and the
      // second (UT_UTF8String) is called s1.
00283 ABI_EXPORT bool operator==(const std::string & s2, const UT_UTF8String& s1);
00284 ABI_EXPORT bool operator!=(const std::string & s2, const UT_UTF8String& s1);
      // Concatenation; the result is returned by value.
00285 ABI_EXPORT UT_UTF8String operator+(const UT_UTF8String & s1, const UT_UTF8String & s2);
      // printf-style formatting: one overload returns a new string by value, the
      // other takes the destination `inStr` by reference and returns a reference.
      // NOTE(review): unlike the UT_String_sprintf declarations earlier in this
      // header, these carry no ABI_PRINTF_FORMAT annotation, so format/argument
      // mismatches are presumably not diagnosed at compile time -- confirm
      // whether that omission is intentional.
00286 ABI_EXPORT UT_UTF8String UT_UTF8String_sprintf(const char * inFormat, ...);
00287 ABI_EXPORT UT_UTF8String & UT_UTF8String_sprintf(UT_UTF8String & inStr, const char * inFormat, ...);
00288 
00289 
00290 /***************************************************************************/
      // Helpers that treat a UT_UTF8String as a "property string"; these mirror
      // the UT_String_* property helpers declared earlier in this header.
      // NOTE(review): the textual format of a property string is not shown here.
      //
      // getPropVal returns the value for sProp as a new string (by value).
00295 ABI_EXPORT UT_UTF8String UT_UTF8String_getPropVal(const UT_UTF8String & sPropertyString, const UT_UTF8String & sProp);
00296 
      // The remaining helpers take their first argument by non-const reference,
      // i.e. they update the caller's string in place.
00297 ABI_EXPORT void UT_UTF8String_removeProperty(UT_UTF8String & sPropertyString, const UT_UTF8String & sProp);
00298 
00299 ABI_EXPORT void UT_UTF8String_setProperty(UT_UTF8String & sPropertyString, const UT_UTF8String &sProp, const UT_UTF8String & sVal);
00300 
00301 ABI_EXPORT void UT_UTF8String_addPropertyString(UT_UTF8String & sPropertyString, const UT_UTF8String & sNewProp);
00302 
      // NOTE(review): presumably replaces occurrences of sOldValue in sString
      // with sNewValue; whether all or only the first occurrence is replaced is
      // not visible in this header.
00303 ABI_EXPORT void UT_UTF8String_replaceString(UT_UTF8String & sString, const UT_UTF8String & sOldValue,const UT_UTF8String & sNewValue );
00304 
00306 //
00307 //  UCS-4 string
00308 //
00309 //  String is built of 32-bit units (longs)
00310 //
00311 //  NOTE: Ambiguity between UCS-2 and UTF-16 above makes no difference
00312 //  NOTE:  in the case of UCS-4 and UTF-32 since they really are
00313 //  NOTE:  identical
00314 //
00316 
00318 //  UT_UCS4String, a simple wrapper for zero terminated 'UCS4' strings.
00319 //
      //  The only data member visible in this header is the private `pimpl`
      //  pointer to a UT_StringImpl<UT_UCS4Char>. Only declarations are shown
      //  here, so remarks marked NOTE(review) are assumptions to confirm against
      //  the implementation file.
00320 
00321 // TODO: add c_str(), encoded_str(const char * to)
00322 
00323 class ABI_EXPORT UT_UCS4String
00324 {
00325 public:
00326     UT_UCS4String();
          // Construct from UCS-4 data; n == 0 (the default) means zero-terminated.
00327     UT_UCS4String(const UT_UCS4Char * sz, size_t n = 0 /* 0 == zero-terminate */);
          // Copy constructor.
00328     UT_UCS4String(const UT_UCS4String& rhs);
00329 
00330     /* construct from a string in UTF-8 format
00331      */
00332     UT_UCS4String(const char * utf8_str, size_t bytelength = 0 /* 0 == zero-terminate */);
00333     UT_UCS4String(const std::string & str /* zero-terminated utf-8 encoded */);
00334 
00335     /* construct from a string in UTF-8 format
00336      * if (strip_whitespace == true) replace all white space sequences with a single UCS_SPACE
00337      * if (strip_whitespace != true) replace CR-LF & CR by LF
00338      * non-breaking spaces (&nbsp; UCS_NBSP 0xa0) are not white space; see UT_UCS4_isspace()
00339      */
          // Unlike the two-argument UTF-8 constructor above, bytelength has no
          // default here, so all three arguments must be supplied.
00340     UT_UCS4String(const char * utf8_str, size_t bytelength /* 0 == zero-terminate */, bool strip_whitespace);
00341 
00342     ~UT_UCS4String();
00343 
          // size() and length() are synonyms: length() simply forwards to size().
00344     size_t  size() const;
00345     size_t length () const { return size () ; }
00346 
00347     void            reserve(size_t n);
00348     bool        empty() const;
          // NOTE(review): declared `const` even though the name suggests it
          // modifies the string; part of the exported interface, left unchanged.
00349     void        clear() const;
00350 
          // Sub-string extraction, each returning a new UT_UCS4String by value:
          // by start index and length, by start index only, or from a pointer.
          // NOTE(review): `iter` is presumably a pointer into this string's own
          // buffer (e.g. obtained from begin()/end()) -- confirm.
00351     UT_UCS4String   substr(size_t iStart, size_t nChars) const;
00352     UT_UCS4String   substr(size_t iStart) const;
00353     UT_UCS4String   substr( const UT_UCS4Char* iter ) const;
00354 
          // Assignment from another UT_UCS4String or from UCS-4 data.
00355     UT_UCS4String&  operator=(const UT_UCS4String&  rhs);
00356     UT_UCS4String&  operator=(const UT_UCS4Char *   rhs);
          // Append a string, UCS-4 data, or a single character given as
          // UT_UCS4Char, char or unsigned char.
00357     UT_UCS4String&  operator+=(const UT_UCS4String& rhs);
00358     UT_UCS4String&  operator+=(const UT_UCS4Char *  rhs);
00359     UT_UCS4String&  operator+=(UT_UCS4Char rhs);
00360     UT_UCS4String&  operator+=(char rhs);
00361     UT_UCS4String&  operator+=(unsigned char rhs);
00362 
          // Element access: the const overload returns the character by value, the
          // non-const overload returns a reference that can be written through.
          // NOTE(review): no bounds-checking contract is visible in this header.
00363     UT_UCS4Char     operator[](size_t iPos) const;
00364     UT_UCS4Char&    operator[](size_t iPos);
00365 
          // Exchanges state with rhs.
00366     void        swap(UT_UCS4String& rhs);
00367 
00368     // The returned pointer is valid until the next non-const
00369     // operation. You will _always_ get a legal pointer back,
00370     // even if to an empty (0) string.
00371     const UT_UCS4Char* ucs4_str() const;
00372 
00373     // The same valid constraints as ucs4_str() applies to begin and end
00374     const UT_UCS4Char* begin() const;
00375     const UT_UCS4Char* end()   const;
00376 
          // Returns the contents as UTF-8. This member is not const-qualified.
          // NOTE(review): ownership and lifetime of the returned buffer are not
          // visible in this header -- check the implementation before holding on
          // to the pointer.
00377     const char * utf8_str ();
00378 
00379 private:
00380     void _loadUtf8(const char * utf8_str, size_t bytelength); // implementation detail for the UTF-8 constructor
          // Opaque implementation object; UT_StringImpl is not defined in this header.
00381     class UT_StringImpl<UT_UCS4Char>* pimpl;
00382 };
00383 
00384 // helpers
      // Equality / inequality between UT_UCS4String values and zero-terminated
      // UCS-4 buffers, with overloads for either operand order.
      // NOTE(review): these are the only free functions in this header declared
      // without ABI_EXPORT (the UT_String and UT_UTF8String helpers all have it).
      // If they are meant to be callable from outside the library, they
      // presumably need the same annotation -- confirm before changing.
00385 bool operator==(const UT_UCS4String& s1, const UT_UCS4String& s2);
00386 bool operator==(const UT_UCS4String& s1, const UT_UCS4Char *  s2);
00387 bool operator==(const UT_UCS4Char *  s1, const UT_UCS4String& s2);
00388 bool operator!=(const UT_UCS4String& s1, const UT_UCS4String& s2);
00389 bool operator!=(const UT_UCS4String& s1, const UT_UCS4Char *  s2);
00390 bool operator!=(const UT_UCS4Char *  s1, const UT_UCS4String& s2);
00391 
00392 // strcmp ordering
00393 bool operator<(const UT_UCS4String& s1, const UT_UCS4String& s2);
00394 
      // Concatenation; the result is returned by value.
00395 UT_UCS4String operator+(const UT_UCS4String& s1, const UT_UCS4String& s2);
00396 
00397 
00398 
00399 #endif  // UT_STRING_CLASS_H

Generated on Sun Feb 14 2021 for AbiWord by  doxygen 1.7.1