ustring seems to be working! yay!
This commit is contained in:
parent
ef02a0c8ae
commit
55f1381860
2 changed files with 59 additions and 34 deletions
|
@ -14,18 +14,48 @@ namespace otk {
|
|||
|
||||
// helper functions
|
||||
|
||||
static ustring::size_type utf8_find_offset(const char *str, const char *pos)
|
||||
// takes a pointer into a utf8 string and returns a unicode character for the
|
||||
// first character at the pointer
|
||||
unichar utf8_get_char (const char *p)
|
||||
{
|
||||
unichar result = static_cast<unsigned char>(*p);
|
||||
|
||||
// if its not a 7-bit ascii character
|
||||
if((result & 0x80) != 0) {
|
||||
// len is the number of bytes this character takes up in the string
|
||||
unsigned char len = utf8_skip[result];
|
||||
result &= 0x7F >> len;
|
||||
|
||||
while(--len != 0) {
|
||||
result <<= 6;
|
||||
result |= static_cast<unsigned char>(*++p) & 0x3F;
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// takes a pointer into a string and finds its offset
|
||||
static ustring::size_type utf8_ptr_to_offset(const char *str, const char *pos)
|
||||
{
|
||||
ustring::size_type offset = 0;
|
||||
|
||||
while (str < pos) {
|
||||
str += g_utf8_skip[*str];
|
||||
offset += g_utf8_skip[*str];
|
||||
str += utf8_skip[*str];
|
||||
offset++;
|
||||
}
|
||||
|
||||
return offset;
|
||||
}
|
||||
|
||||
// takes an offset into a string and returns a pointer to it
|
||||
const char *utf8_offset_to_ptr(const char *str, ustring::size_type offset)
|
||||
{
|
||||
while (offset--)
|
||||
str += utf8_skip[*str];
|
||||
return str;
|
||||
}
|
||||
|
||||
// First overload: stop on '\0' character.
|
||||
ustring::size_type utf8_byte_offset(const char* str, ustring::size_type offset)
|
||||
{
|
||||
|
@ -39,7 +69,7 @@ ustring::size_type utf8_byte_offset(const char* str, ustring::size_type offset)
|
|||
if(*p == '\0')
|
||||
return ustring::npos;
|
||||
|
||||
p += g_utf8_skip[*p];
|
||||
p += utf8_skip[*p];
|
||||
}
|
||||
|
||||
return (p - str);
|
||||
|
@ -60,7 +90,7 @@ ustring::size_type utf8_byte_offset(const char* str, ustring::size_type offset,
|
|||
if(p >= pend)
|
||||
return ustring::npos;
|
||||
|
||||
p += g_utf8_skip[*p];
|
||||
p += utf8_skip[*p];
|
||||
}
|
||||
|
||||
return (p - str);
|
||||
|
@ -122,7 +152,7 @@ ustring::size_type ustring::size() const
|
|||
{
|
||||
if (_utf8) {
|
||||
const char *const pdata = _string.data();
|
||||
return utf8_find_offset(pdata, pdata + _string.size());
|
||||
return utf8_ptr_to_offset(pdata, pdata + _string.size());
|
||||
} else
|
||||
return _string.size();
|
||||
}
|
||||
|
@ -181,6 +211,11 @@ void ustring::resize(ustring::size_type n, char c)
|
|||
_string.resize(n, c);
|
||||
}
|
||||
|
||||
// Returns the character at index i, by value.
//
// The index uses the same units as size(): characters when the string is
// flagged as UTF-8, bytes otherwise. No bounds checking is performed.
ustring::value_type ustring::operator[](ustring::size_type i) const
{
  // size() counts bytes for a non-UTF-8 string, so index by byte as well
  // instead of walking (and decoding) the data as multi-byte sequences
  if (!_utf8)
    return static_cast<unsigned char>(_string[i]);

  return utf8_get_char(utf8_offset_to_ptr(_string.data(), i));
}
|
||||
|
||||
const char* ustring::data() const
|
||||
{
|
||||
return _string.data();
|
||||
|
|
|
@ -7,7 +7,7 @@
|
|||
*/
|
||||
|
||||
extern "C" {
|
||||
/*
|
||||
|
||||
#ifdef HAVE_STDINT_H
|
||||
# include <stdint.h>
|
||||
#else
|
||||
|
@ -15,25 +15,25 @@ extern "C" {
|
|||
# include <sys/types.h>
|
||||
# endif
|
||||
#endif
|
||||
*/
|
||||
|
||||
}
|
||||
|
||||
#include <string>
|
||||
|
||||
namespace otk {
|
||||
|
||||
/*
|
||||
|
||||
#ifdef HAVE_STDINT_H
|
||||
typedef uint32_t unichar;
|
||||
#else
|
||||
typedef u_int32_t unichar;
|
||||
#endif
|
||||
*/
|
||||
|
||||
|
||||
#ifndef DOXYGEN_IGNORE
|
||||
|
||||
//! The number of bytes to skip to find the next character in the string
|
||||
const char g_utf8_skip[256] = {
|
||||
const char utf8_skip[256] = {
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||
|
@ -44,6 +44,8 @@ const char g_utf8_skip[256] = {
|
|||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
|
||||
};
|
||||
|
||||
// Takes a pointer into a utf8 string and returns the unicode character for
// the first character at the pointer. The length of the sequence is looked up
// from its first byte in the skip table above.
unichar utf8_get_char(const char *p);
|
||||
|
||||
#endif // DOXYGEN_IGNORE
|
||||
|
||||
//! The iterator type for ustring
|
||||
|
@ -57,13 +59,13 @@ const char g_utf8_skip[256] = {
|
|||
write operation would invalidate all other iterators pointing into the same
|
||||
string.
|
||||
*/
|
||||
/*
|
||||
|
||||
template <class T>
|
||||
class ustring_Iterator
|
||||
{
|
||||
public:
|
||||
typedef std::bidirectional_iterator_tag iterator_category;
|
||||
//typedef unichar value_type;
|
||||
typedef unichar value_type;
|
||||
typedef std::string::difference_type difference_type;
|
||||
//typedef value_type reference;
|
||||
typedef void pointer;
|
||||
|
@ -74,26 +76,9 @@ public:
|
|||
|
||||
|
||||
// Dereference: returns, by value, the unicode character at the iterator's
// position.
inline value_type operator*() const {
  // get an iterator to the internal string
  std::string::const_iterator pos = _pos;

  // decode through the shared helper rather than a private copy of the
  // decoding loop, so there is a single implementation to maintain
  return utf8_get_char(&(*pos));
}
|
||||
|
||||
|
||||
|
@ -112,7 +97,7 @@ public:
|
|||
private:
|
||||
T _pos;
|
||||
};
|
||||
*/
|
||||
|
||||
|
||||
//! This class provides a simple wrapper to a std::string that can be encoded
|
||||
//! as UTF-8. The ustring::utf() member specifies if the given string is UTF-8
|
||||
|
@ -135,7 +120,7 @@ public:
|
|||
typedef std::string::size_type size_type;
|
||||
typedef std::string::difference_type difference_type;
|
||||
|
||||
//typedef unichar value_type;
|
||||
typedef unichar value_type;
|
||||
//typedef unichar & reference;
|
||||
//typedef const unichar & const_reference;
|
||||
|
||||
|
@ -177,6 +162,11 @@ public:
|
|||
|
||||
void resize(size_type n, char c='\0');
|
||||
|
||||
// extract characters
|
||||
|
||||
// No reference return; use replace() to write characters.
|
||||
value_type operator[](size_type i) const;
|
||||
|
||||
// internal data
|
||||
|
||||
const char* data() const;
|
||||
|
|
Loading…
Reference in a new issue