From 711e499c703c5bd3a12183c2cbbd3910c7aba99b Mon Sep 17 00:00:00 2001 From: Dana Jansens Date: Sun, 12 Jan 2003 23:55:59 +0000 Subject: [PATCH] start on otk::ustring (unicode/utf8) --- otk/ustring.cc | 44 +++++++++++++++ otk/ustring.hh | 148 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 192 insertions(+) create mode 100644 otk/ustring.cc create mode 100644 otk/ustring.hh diff --git a/otk/ustring.cc b/otk/ustring.cc new file mode 100644 index 00000000..571f9773 --- /dev/null +++ b/otk/ustring.cc @@ -0,0 +1,44 @@ +// -*- mode: C++; indent-tabs-mode: nil; c-basic-offset: 2; -*- + +#ifdef HAVE_CONFIG_H +# include "../config.h" +#endif // HAVE_CONFIG_H + +#include "ustring.hh" + +extern "C" { +#include +} + +namespace otk { + +ustring::ustring() +{ +} + +ustring::~ustring() +{ +} + +ustring::ustring(const ustring& other) + : _string(other._string) +{ +} + +ustring& ustring::operator=(const ustring& other) +{ + _string = other._string; + return *this; +} + +ustring::ustring(const std::string& src) + : _string(src) +{ +} + +ustring::ustring(const char* src) + : _string(src) +{ +} + +} diff --git a/otk/ustring.hh b/otk/ustring.hh new file mode 100644 index 00000000..5d011cfc --- /dev/null +++ b/otk/ustring.hh @@ -0,0 +1,148 @@ +// -*- mode: C++; indent-tabs-mode: nil; c-basic-offset: 2; -*- +#ifndef __ustring_hh +#define __ustring_hh + +/*! @file ustring.hh + @brief Provides a simple UTF-8 encoded string +*/ + +extern "C" { +#ifdef HAVE_STDINT_H +# include +#else +# ifdef HAVE_SYS_TYPES_H +# include +# endif +#endif +} + +#include + +namespace otk { + +//! 
//! The number of bytes to skip to find the next character in the string
/*!
  Indexed by the first byte of a UTF-8 sequence, taken as an unsigned char.
  Bytes 0x00-0xBF map to 1, 0xC0-0xDF to 2, 0xE0-0xEF to 3, 0xF0-0xF7 to 4,
  0xF8-0xFB to 5, 0xFC-0xFD to 6 and 0xFE-0xFF to 1.
*/
const char g_utf8_skip[256] = {
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
};

//! The 32-bit type that holds one decoded unicode character
#ifdef HAVE_STDINT_H
typedef uint32_t unichar;
#else
typedef u_int32_t unichar;
#endif

//! The iterator type for ustring
/*!
  Note this is not a random access iterator but a bidirectional one, since all
  index operations need to iterate over the UTF-8 data. Use std::advance() to
  move to a certain position.

  A writeable iterator isn't provided because: The number of bytes of the old
  UTF-8 character and the new one to write could be different. Therefore, any
  write operation would invalidate all other iterators pointing into the same
  string.

  @tparam T The std::string iterator type being wrapped; ustring instantiates
            this with std::string::iterator and std::string::const_iterator.
*/
template <class T>
class ustring_Iterator
{
public:
  typedef std::bidirectional_iterator_tag iterator_category;
  typedef unichar                         value_type;
  typedef std::string::difference_type    difference_type;
  typedef value_type                      reference;
  typedef void                            pointer;

  //! Constructs an iterator that wraps a value-initialized T
  inline ustring_Iterator() : _pos() {}

  //! Constructs an iterator from one wrapping a std::string::iterator
  /*!
    For T = std::string::iterator this is the copy constructor; for
    T = std::string::const_iterator it converts a mutable iterator into a
    const one. base() is used because _pos of another instantiation is not
    accessible from here.
  */
  inline ustring_Iterator(const ustring_Iterator<std::string::iterator>& other)
    : _pos(other.base()) {}

  //! Decodes and returns the unicode character at the iterator's position
  /*!
    No validation is performed: the lead byte alone determines how many
    following bytes are read, so the iterator must point at the first byte of
    a complete sequence inside the string.
  */
  inline value_type operator*() const {
    // work on a copy so that dereferencing never moves the iterator itself
    std::string::const_iterator pos = _pos;

    unichar result = static_cast<unsigned char>(*pos);

    // if its not a 7-bit ascii character
    if ((result & 0x80) != 0) {
      // len is the number of bytes this character takes up in the string
      unsigned char len = g_utf8_skip[result];
      // strip the length marker bits, leaving the payload of the lead byte
      result &= 0x7F >> len;

      // every following byte contributes its low 6 bits
      while (--len != 0) {
        result <<= 6;
        result |= static_cast<unsigned char>(*++pos) & 0x3F;
      }
    }

    return result;
  }

  //! Moves forward to the first byte of the next character
  inline ustring_Iterator<T>& operator++() {
    _pos += g_utf8_skip[static_cast<unsigned char>(*_pos)];
    return *this;
  }

  //! Moves forward one character, returning the position before the move
  inline const ustring_Iterator<T> operator++(int) {
    const ustring_Iterator<T> previous(*this);
    ++*this;
    return previous;
  }

  //! Moves backward to the first byte of the previous character
  /*!
    Steps back over continuation bytes (bit pattern 10xxxxxx) until a lead
    byte is found. There is no check for the start of the string.
  */
  inline ustring_Iterator<T>& operator--() {
    do {
      --_pos;
    } while ((static_cast<unsigned char>(*_pos) & 0xC0) == 0x80);
    return *this;
  }

  //! Moves backward one character, returning the position before the move
  inline const ustring_Iterator<T> operator--(int) {
    const ustring_Iterator<T> previous(*this);
    --*this;
    return previous;
  }

  //! Constructs an iterator positioned at a std::string iterator
  explicit inline ustring_Iterator(T pos) : _pos(pos) {}

  //! Returns the wrapped std::string iterator
  inline T base() const { return _pos; }

private:
  //! Position of the current character's first byte in the std::string
  T _pos;
};

//! Returns true if both iterators wrap the same std::string position
template <class T>
inline bool operator==(const ustring_Iterator<T>& lhs,
                       const ustring_Iterator<T>& rhs)
{
  return lhs.base() == rhs.base();
}

//! Returns true if the iterators wrap different std::string positions
template <class T>
inline bool operator!=(const ustring_Iterator<T>& lhs,
                       const ustring_Iterator<T>& rhs)
{
  return !(lhs.base() == rhs.base());
}

//! This class provides a simple wrapper to a std::string that is encoded as
//! UTF-8.
/*!
  This class does not handle extended 8-bit ASCII charsets like
  ISO-8859-1.

  More info on Unicode and UTF-8 can be found here:
  http://www.cl.cam.ac.uk/~mgk25/unicode.html
*/
+ This does not subclass std::string, because std::string was intended to be a + final class. For instance, it does not have a virtual destructor. +*/ +class ustring { + std::string _string; + +public: + typedef std::string::size_type size_type; + typedef std::string::difference_type difference_type; + + typedef unichar value_type; + typedef unichar & reference; + typedef const unichar & const_reference; + + typedef ustring_Iterator iterator; + typedef ustring_Iterator const_iterator; + + static const size_type npos = std::string::npos; + + ustring(); + ~ustring(); + + // make new strings + + ustring(const ustring& other); + ustring& operator=(const ustring& other); + ustring(const std::string& src); + ustring::ustring(const char* src); + + +}; + +} + +#endif // __ustring_hh