• Main Page
  • Related Pages
  • Namespaces
  • Classes
  • Files
  • File List
  • File Members

ut_string_class.h

Go to the documentation of this file.
00001 // ut_string_class.h
00002 //
00003 // A simple string class for use where templates are not
00004 // allowed.
00005 //
00006 #ifndef UT_STRING_CLASS_H
00007 #define UT_STRING_CLASS_H
00008 
00009 //
00010 // Copyright (C) 2001 Mike Nordell <tamlin@algonet.se>
00011 // Copyright (C) 2001 Dom Lachowicz <dominicl@seas.upenn.edu>
00012 // Copyright (C) 2002 Tomas Frydrych <tomas@frydrych.uklinux.net>
00013 //
00014 // This class is free software; you can redistribute it and/or
00015 // modify it under the terms of the GNU General Public License
00016 // as published by the Free Software Foundation; either version 2
00017 // of the License, or (at your option) any later version.
00018 //
00019 // This class is distributed in the hope that it will be useful,
00020 // but WITHOUT ANY WARRANTY; without even the implied warranty of
00021 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00022 // GNU General Public License for more details.
00023 //
00024 // You should have received a copy of the GNU General Public License
00025 // along with this program; if not, write to the Free Software
00026 // Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
00027 // 02111-1307, USA.
00028 //
00029 
00030 #include <stdlib.h>
00031 #include <stdarg.h>
00032 
00033 #if defined(__MINGW32__)
00034 #  undef snprintf
00035 #  if __GNUC__ <= 3
00036 #    define _GLIBCXX_USE_C99_DYNAMIC 1
00037 #  endif
00038 #endif
00039 
00040 #include <string>
00041 
00042 /* pre-emptive dismissal; ut_types.h is needed by just about everything,
00043  * so even if it's commented out in-file that's still a lot of work for
00044  * the preprocessor to do...
00045  */
00046 #ifndef UT_TYPES_H
00047 #include "ut_types.h"
00048 #endif
00049 #include "ut_string.h"
00050 #include "ut_stringbuf.h"
00051 
00052 // Forward declarations
      // Incomplete types are enough at this point: UT_ByteBuf and UT_UCS4_mbtowc
      // are only taken by reference (UT_UTF8String::appendBuf), and
      // UT_UCS4String is named in UT_UTF8String's signatures before its own
      // definition further down.
00053 class UT_ByteBuf;
00054 class UT_UCS4_mbtowc;
00055 class UT_String;
00056 class UT_UTF8String;
00057 class UT_UCS4String;
00058 
00059 
00060 // yes, this is screaming for a template
00061 
00063 //
00064 //  8-bit string
00065 //
00066 //  String is built of 8-bit units (bytes)
00067 //  Encoding could be any single-byte or multi-byte encoding
00068 //
00070 
00072 //  UT_String, a simple wrapper for zero terminated 'char' strings.
00073 //
      //  The class declares its own copy constructor, copy assignment and
      //  destructor; its only data member is the private pimpl pointer.
00074 class ABI_EXPORT UT_String
00075 {
00076 public:
00077     UT_String();
          // n == 0 (the default) means sz is read up to its terminating zero.
          // For n != 0 presumably exactly n chars are taken — confirm in the .cpp.
00078     UT_String(const char* sz, size_t n = 0 /* 0 == zero-terminate */);
00079     UT_String(const UT_String& rhs);
00080     UT_String(const std::basic_string<char> &s);
00081     ~UT_String();
00082 
00083     size_t      size() const;
00084     size_t length () const { return size () ; }  // inline alias: same value as size()
00085     void            reserve(size_t n);
00086     bool        empty() const;
          // NOTE(review): clear() is declared const although its name suggests it
          // mutates the string — confirm this is intentional before relying on it.
00087     void        clear() const;
00088 
          // Returns the result by value; treatment of an out-of-range iStart or
          // nChars is not visible in this header.
00089     UT_String   substr(size_t iStart, size_t nChars) const;
00090 
00091     UT_String&  operator=(const UT_String& rhs);
00092     UT_String&  operator=(const char*      rhs);
00093     UT_String&  operator=(const std::basic_string<char> & rhs);
00094     UT_String&  operator+=(const UT_String& rhs);
00095     UT_String&  operator+=(const char*      rhs);
00096     UT_String&  operator+=(char rhs);
00097 
          // The const overload returns the char by value, the non-const overload
          // returns a writable reference. Bounds handling for iPos is not visible
          // in this header.
00098     char        operator[](size_t iPos) const;
00099     char&       operator[](size_t iPos);
00100 
00101     void        swap(UT_String& rhs);
00102 
00103     // The returned pointer is valid until the next non-const
00104     // operation. You will _always_ get a legal pointer back,
00105     // even if to an empty string.
00106     const char* c_str() const;
00107 
00108 private:
          // UT_StringImpl is not defined in this header (presumably it comes from
          // ut_stringbuf.h — confirm).
00109     class UT_StringImpl<char>* pimpl;
00110 };
00111 
00112 // helpers
      // Equality and inequality for UT_String against UT_String or const char*,
      // with the C string allowed on either side of the operator.
      // How a null const char* is treated is not visible in this header.
00113 ABI_EXPORT bool operator==(const UT_String& s1, const UT_String& s2);
00114 ABI_EXPORT bool operator==(const UT_String& s1, const char*      s2);
00115 ABI_EXPORT bool operator==(const char*      s1, const UT_String& s2);
00116 ABI_EXPORT bool operator!=(const UT_String& s1, const UT_String& s2);
00117 ABI_EXPORT bool operator!=(const UT_String& s1, const char*      s2);
00118 ABI_EXPORT bool operator!=(const char*      s1, const UT_String& s2);
00119 
00120 ABI_EXPORT UT_uint32 hashcode(const UT_String& string);
00121 ABI_EXPORT UT_uint32 hashcode(const char *s);
      // Both overloads produce a UT_uint32 hash. Whether they agree for the same
      // character content is not visible here — confirm before mixing UT_String
      // and const char* keys in one table.
00122 
00123 // strcmp ordering
00124 ABI_EXPORT bool operator<(const UT_String& s1, const UT_String& s2);
00125 
      // Concatenation: both operands are const and the result is returned by value.
00126 ABI_EXPORT UT_String operator+(const UT_String& s1, const UT_String& s2);
00127 
00128 ABI_EXPORT size_t UT_String_findCh(const UT_String &st, char ch);
00129 ABI_EXPORT size_t UT_String_findRCh(const UT_String &st, char ch);
      // Look for ch in st and report a position as size_t. Judging by the names,
      // findCh searches forward and findRCh searches in reverse — confirm.
      // The value returned when ch is absent is not visible here; check the
      // implementation before comparing against a sentinel.
00130 
00131 /****************************************************************************/
00132 
00137 ABI_EXPORT UT_String& UT_String_sprintf(UT_String & inStr, const char * inFormat, ...) ABI_PRINTF_FORMAT(2,3);
      // In-place variants: each takes the destination inStr by non-const
      // reference and returns a UT_String& (presumably inStr itself — confirm).
      // ABI_PRINTF_FORMAT is a project macro not defined in this header;
      // presumably its arguments are the 1-based index of the format parameter
      // and of the first variadic argument (0 for the va_list form) — TODO confirm.
00138 ABI_EXPORT UT_String& UT_String_vprintf (UT_String & inStr, const char *format,
00139                                          va_list      args1)
00140     ABI_PRINTF_FORMAT(2,0);
      // The overload whose format is a UT_String carries no ABI_PRINTF_FORMAT
      // annotation.
00141 ABI_EXPORT UT_String& UT_String_vprintf (UT_String & inStr, const UT_String & format,
00142                      va_list      args1);
00143 
00148 ABI_EXPORT UT_String UT_String_sprintf(const char * inFormat, ...)
00149     ABI_PRINTF_FORMAT(1,2);
      // Value-returning variants: no destination argument, the formatted
      // UT_String is returned by value. As with the in-place forms, only the
      // const char* format overloads carry ABI_PRINTF_FORMAT.
00150 ABI_EXPORT UT_String UT_String_vprintf(const char * inFormat, va_list args1)
00151     ABI_PRINTF_FORMAT(1,0);
00152 ABI_EXPORT UT_String UT_String_vprintf(const UT_String & inFormat, va_list args1);
00153 
00154 /***************************************************************************/
00155 
00156 /***************************************************************************/
00161 ABI_EXPORT UT_String UT_String_getPropVal(const UT_String & sPropertyString, const UT_String & sProp);
      // Property-string helpers. getPropVal takes both arguments as const and
      // returns a UT_String by value; the three functions below return void and
      // receive sPropertyString by non-const reference, so it is their in/out
      // argument. The property-string syntax and the behaviour for a missing or
      // duplicate sProp are not visible in this header — see the implementation.
00162 ABI_EXPORT void UT_String_removeProperty(UT_String & sPropertyString, const UT_String & sProp);
00163 ABI_EXPORT void UT_String_setProperty(UT_String & sPropertyString, const UT_String &sProp, const UT_String & sVal);
00164 ABI_EXPORT void UT_String_addPropertyString(UT_String & sPropertyString, const UT_String & sNewProp);
00165 
00167 //
00168 //  UTF-8 string: encoding is *always* UTF-8
00169 //
00171 
00173 //  UT_UTF8String, a simple wrapper for zero terminated 'UTF-8' strings.
00174 //
      //  Declares its own copy constructor, copy assignment and destructor; the
      //  only data member is the private UT_UTF8Stringbuf pointer.
00175 
00176 class ABI_EXPORT UT_UTF8String
00177 {
00178 public:
00179     UT_UTF8String ();
00180     UT_UTF8String (const char * sz, size_t n = 0 /* 0 == null-termination */);
          // Presumably converts sz from the named encoding into UTF-8 — confirm.
00181     UT_UTF8String (const char *sz, const char *encoding);
00182 
00183     UT_UTF8String (const UT_UTF8String & rhs);
00184     UT_UTF8String (const UT_UCS4String & rhs);
00185     UT_UTF8String (const UT_UCSChar * sz, size_t n = 0 /* 0 == zero-terminate */);
00186 
00187     ~UT_UTF8String ();
00188 
          // size()/length() and byteLength() are separate queries; presumably the
          // former counts characters and the latter bytes — confirm in the .cpp.
00189     size_t      size () const;
00190     size_t length () const { return size () ; }  // inline alias: same value as size()
00191 
00192     void            reserve(size_t n);
00193     bool        empty () const;
          // NOTE(review): clear() is declared const although its name suggests it
          // mutates the string — confirm this is intentional.
00194     void        clear () const;
00195     size_t      byteLength() const;
00196     void        dump(void) const;
00197     UT_UTF8String   substr(size_t iStart, size_t nChars) const;
00198 
00199     UT_UTF8String & operator=(const char *          rhs);
00200     UT_UTF8String & operator=(const std::string &   rhs);
00201     UT_UTF8String & operator=(const UT_UTF8String & rhs);
00202     UT_UTF8String & operator=(const UT_UCS4String & rhs);
00203 
00204     UT_UTF8String & operator+=(const UT_UCS4Char     rhs);
00205     UT_UTF8String & operator+=(const char *          rhs);
00206     UT_UTF8String & operator+=(const std::string &   rhs);
00207     UT_UTF8String & operator+=(const UT_UTF8String & rhs);
00208     UT_UTF8String & operator+=(const UT_UCS4String & rhs);
00209 
00210     // The returned pointer is valid until the next non-const
00211     // operation. You will _always_ get a legal pointer back,
00212     // even if to an empty (0) string.
00213     const char * utf8_str () const;
          // Returns a UT_UCS4String by value; note this member is not const.
00214     UT_UCS4String ucs4_str ();
00215 
00216     void        assign (const char * sz, size_t n = 0 /* 0 == null-termination */);
00217     void        append (const char * sz, size_t n = 0 /* 0 == null-termination */);
00218     void        appendBuf (const UT_ByteBuf & buf, UT_UCS4_mbtowc & converter);
00219 
00220     void        appendUCS4 (const UT_UCS4Char * sz, size_t n = 0 /* 0 == null-termination */);
00221     void        appendUCS2 (const UT_UCS2Char * sz, size_t n = 0 /* 0 == null-termination */);
00222 
          // The transformations below are non-const members that return a
          // const UT_UTF8String & (presumably *this — confirm).
00223     const UT_UTF8String & escape (const UT_UTF8String & str1,
00224                       const UT_UTF8String & str2);  // replaces <str1> with <str2> in the current string
00225     const UT_UTF8String & escapeXML ();  // escapes '<', '>', '"', & '&' in the current string
00226     const UT_UTF8String & decodeXML ();  // unescapes '<', '>', '"', & '&' in the current string
00227     const UT_UTF8String & escapeMIME (); // translates the current string to MIME "quoted-printable" format
00228     const UT_UTF8String & lowerCase ();  // forces current string to lowercase
00229     const UT_UTF8String & escapeURL ();  // make URL conform to RFC 1738
00230     const UT_UTF8String & decodeURL ();
00231 
00232     /* UT_UTF8String - NOTES
00233      *
00234      * TODO:
00235      * 1. Maybe have a search&replace function, something like:
00236      *
00237      *  int replace (const char * utf_newstr, const char * utf_oldstr);
00238      *
00239      *    which could be used to do substitutions, e.g.:
00240      *
00241      *  UT_UTF8String xmlstr = "expr: if ((c > 0) && (c < 0x80)) return c;";
00242      *  xmlstr.replace ("&lt;", "<");
00243      *  xmlstr.replace ("&gt;", ">");
00244      *  xmlstr.replace ("&amp;","&");
00245      *
00246      *  MIQ: Note that for these replace methods, one might use ut_std_string/replace_all()
00247      *
00248      *
00249      * getIterator:
00250      * returns a home-made iterator associated with the UTF-8 string, e.g.:
00251      *
00252      *  UT_UTF8String str = "This is a UTF-8 string.";
00253      *  UT_UTF8Stringbuf::UTF8Iterator iter = str.getIterator (); // returned by value
00254      *  iter = iter.start (); // iter.start() returns 0 if no string, so:
00255      *  if (iter.current ())
00256      *  {
00257      *      while (true)
00258      *      {
00259      *          char * pUTF = iter.current ();
00260      *          if (*pUTF == 0) break; // end-of-string
00261      *          // etc.
00262      *          iter.advance (); // or ++iter;
00263      *      }
00264      *  }
00265      *
00266      * The iterator will be well behaved provided the string is not being edited.
00267      */
00268     UT_UTF8Stringbuf::UTF8Iterator getIterator () const
00269     {
              // Builds a fresh iterator over the buffer object held in pimpl.
00270         return UT_UTF8Stringbuf::UTF8Iterator(pimpl);
00271     }
00272 
00273 private:
00274     class UT_UTF8Stringbuf * pimpl;
00275 };
00276 
00277 ABI_EXPORT bool operator<(const UT_UTF8String& s1, const UT_UTF8String& s2);
      // Comparison helpers. A const char* is accepted only as the right-hand
      // operand of == / !=, whereas std::string is accepted on either side.
      // The ordering used by operator< is not stated in this header.
00278 ABI_EXPORT bool operator==(const UT_UTF8String& s1, const UT_UTF8String& s2);
00279 ABI_EXPORT bool operator!=(const UT_UTF8String& s1, const UT_UTF8String& s2);
00280 ABI_EXPORT bool operator==(const UT_UTF8String& s1, const char * s2);
00281 ABI_EXPORT bool operator!=(const UT_UTF8String& s1, const char * s2);
00282 ABI_EXPORT bool operator==(const UT_UTF8String& s1, const std::string & s2);
00283 ABI_EXPORT bool operator!=(const UT_UTF8String& s1, const std::string & s2);
00284 ABI_EXPORT bool operator==(const std::string & s2, const UT_UTF8String& s1);
00285 ABI_EXPORT bool operator!=(const std::string & s2, const UT_UTF8String& s1);
      // Concatenation: both operands are const and the result is returned by value.
00286 ABI_EXPORT UT_UTF8String operator+(const UT_UTF8String & s1, const UT_UTF8String & s2);
00287 ABI_EXPORT UT_UTF8String UT_UTF8String_sprintf(const char * inFormat, ...);
      // The first form returns the formatted string by value; the second takes a
      // destination inStr and returns a UT_UTF8String & (presumably inStr — confirm).
      // NOTE(review): unlike the UT_String_sprintf declarations in this header,
      // neither overload carries ABI_PRINTF_FORMAT, so format/argument mismatches
      // are presumably not diagnosed at compile time — consider adding it.
00288 ABI_EXPORT UT_UTF8String & UT_UTF8String_sprintf(UT_UTF8String & inStr, const char * inFormat, ...);
00289 
00290 
00291 /***************************************************************************/
00296 ABI_EXPORT UT_UTF8String UT_UTF8String_getPropVal(const UT_UTF8String & sPropertyString, const UT_UTF8String & sProp);
      // UT_UTF8String counterparts of the UT_String_* property helpers declared
      // earlier in this header (same names and parameter lists, different string
      // type). getPropVal returns by value; the functions below return void and
      // take their first argument by non-const reference, making it the in/out
      // argument. Exact matching and replacement rules are not visible here.
00297 
00298 ABI_EXPORT void UT_UTF8String_removeProperty(UT_UTF8String & sPropertyString, const UT_UTF8String & sProp);
00299 
00300 ABI_EXPORT void UT_UTF8String_setProperty(UT_UTF8String & sPropertyString, const UT_UTF8String &sProp, const UT_UTF8String & sVal);
00301 
00302 ABI_EXPORT void UT_UTF8String_addPropertyString(UT_UTF8String & sPropertyString, const UT_UTF8String & sNewProp);
00303 
      // Judging by the parameter names, replaces sOldValue with sNewValue inside
      // sString; whether one or all occurrences are replaced is not visible here.
00304 ABI_EXPORT void UT_UTF8String_replaceString(UT_UTF8String & sString, const UT_UTF8String & sOldValue,const UT_UTF8String & sNewValue );
00305 
00307 //
00308 //  UCS-4 string
00309 //
00310 //  String is built of 32-bit units (longs)
00311 //
00312 //  NOTE: Ambiguity between UCS-2 and UTF-16 above makes no difference
00313 //  NOTE:  in the case of UCS-4 and UTF-32 since they really are
00314 //  NOTE:  identical
00315 //
00317 
00319 //  UT_UCS4String, a simple wrapper for zero terminated 'UCS4' strings.
00320 //
      //  Declares its own copy constructor, copy assignment and destructor; the
      //  only data member is the private UT_StringImpl<UT_UCS4Char> pointer.
00321 
00322 // TODO: add c_str(), encoded_str(const char * to)
00323 
00324 class ABI_EXPORT UT_UCS4String
00325 {
00326 public:
00327     UT_UCS4String();
00328     UT_UCS4String(const UT_UCS4Char * sz, size_t n = 0 /* 0 == zero-terminate */);
00329     UT_UCS4String(const UT_UCS4String& rhs);
00330 
00331     /* construct from a string in UTF-8 format
00332      */
00333     UT_UCS4String(const char * utf8_str, size_t bytelength = 0 /* 0 == zero-terminate */);
00334     UT_UCS4String(const std::string & str /* zero-terminated utf-8 encoded */);
00335 
00336     /* construct from a string in UTF-8 format
00337      * if (strip_whitespace == true) replace all white space sequences with a single UCS_SPACE
00338      * if (strip_whitespace != true) replace CR-LF & CR by LF
00339      * non-breaking spaces (&nbsp; UCS_NBSP, U+00A0) are not white space; see UT_UCS4_isspace()
00340      */
          // Unlike the two-argument UTF-8 constructor above, bytelength has no
          // default here, so all three arguments must be supplied.
00341     UT_UCS4String(const char * utf8_str, size_t bytelength /* 0 == zero-terminate */, bool strip_whitespace);
00342 
00343     ~UT_UCS4String();
00344 
00345     size_t  size() const;
00346     size_t length () const { return size () ; }  // inline alias: same value as size()
00347 
00348     void            reserve(size_t n);
00349     bool        empty() const;
          // NOTE(review): clear() is declared const although its name suggests it
          // mutates the string — confirm this is intentional.
00350     void        clear() const;
00351 
          // Three ways to name the start of the substring: index plus count,
          // index only, or a pointer (presumably one obtained from begin()/end()
          // of this same string — confirm). Range checking is not visible here.
00352     UT_UCS4String   substr(size_t iStart, size_t nChars) const;
00353     UT_UCS4String   substr(size_t iStart) const;
00354     UT_UCS4String   substr( const UT_UCS4Char* iter ) const;
00355 
00356     UT_UCS4String&  operator=(const UT_UCS4String&  rhs);
00357     UT_UCS4String&  operator=(const UT_UCS4Char *   rhs);
00358     UT_UCS4String&  operator+=(const UT_UCS4String& rhs);
00359     UT_UCS4String&  operator+=(const UT_UCS4Char *  rhs);
00360     UT_UCS4String&  operator+=(UT_UCS4Char rhs);
00361     UT_UCS4String&  operator+=(char rhs);
00362     UT_UCS4String&  operator+=(unsigned char rhs);
00363 
          // The const overload returns the code unit by value, the non-const
          // overload returns a writable reference.
00364     UT_UCS4Char     operator[](size_t iPos) const;
00365     UT_UCS4Char&    operator[](size_t iPos);
00366 
00367     void        swap(UT_UCS4String& rhs);
00368 
00369     // The returned pointer is valid until the next non-const
00370     // operation. You will _always_ get a legal pointer back,
00371     // even if to an empty (0) string.
00372     const UT_UCS4Char* ucs4_str() const;
00373 
00374     // The same validity constraints as for ucs4_str() apply to begin() and end()
00375     const UT_UCS4Char* begin() const;
00376     const UT_UCS4Char* end()   const;
00377 
          // NOTE(review): non-const member returning a raw const char*; who owns
          // the returned buffer and how long it stays valid is not visible in
          // this header — check the implementation before storing the pointer.
00378     const char * utf8_str ();
00379 
00380 private:
00381     void _loadUtf8(const char * utf8_str, size_t bytelength); // implementation detail for the UTF-8 constructor
00382     class UT_StringImpl<UT_UCS4Char>* pimpl;
00383 };
00384 
00385 // helpers
      // Equality and inequality between UT_UCS4String values and UT_UCS4Char
      // pointers, in either operand order. Marked ABI_EXPORT to match the
      // corresponding UT_String and UT_UTF8String helpers in this header, so the
      // operators are also visible to code outside the exporting library.
00386 ABI_EXPORT bool operator==(const UT_UCS4String& s1, const UT_UCS4String& s2);
00387 ABI_EXPORT bool operator==(const UT_UCS4String& s1, const UT_UCS4Char *  s2);
00388 ABI_EXPORT bool operator==(const UT_UCS4Char *  s1, const UT_UCS4String& s2);
00389 ABI_EXPORT bool operator!=(const UT_UCS4String& s1, const UT_UCS4String& s2);
00390 ABI_EXPORT bool operator!=(const UT_UCS4String& s1, const UT_UCS4Char *  s2);
00391 ABI_EXPORT bool operator!=(const UT_UCS4Char *  s1, const UT_UCS4String& s2);
00392 
00393 // strcmp ordering
00394 ABI_EXPORT bool operator<(const UT_UCS4String& s1, const UT_UCS4String& s2);
00395 
      // Concatenation: both operands are const and the result is returned by value.
00396 ABI_EXPORT UT_UCS4String operator+(const UT_UCS4String& s1, const UT_UCS4String& s2);
00397 
00398 
00399 
00400 #endif  // UT_STRING_CLASS_H

Generated on Mon May 28 2012 for AbiWord by  doxygen 1.7.1