ustring seems to be working! yay!
This commit is contained in:
parent
ef02a0c8ae
commit
55f1381860
2 changed files with 59 additions and 34 deletions
|
@ -14,18 +14,48 @@ namespace otk {
|
||||||
|
|
||||||
// helper functions
|
// helper functions
|
||||||
|
|
||||||
static ustring::size_type utf8_find_offset(const char *str, const char *pos)
|
// takes a pointer into a utf8 string and returns a unicode character for the
|
||||||
|
// first character at the pointer
|
||||||
|
unichar utf8_get_char (const char *p)
|
||||||
|
{
|
||||||
|
unichar result = static_cast<unsigned char>(*p);
|
||||||
|
|
||||||
|
// if its not a 7-bit ascii character
|
||||||
|
if((result & 0x80) != 0) {
|
||||||
|
// len is the number of bytes this character takes up in the string
|
||||||
|
unsigned char len = utf8_skip[result];
|
||||||
|
result &= 0x7F >> len;
|
||||||
|
|
||||||
|
while(--len != 0) {
|
||||||
|
result <<= 6;
|
||||||
|
result |= static_cast<unsigned char>(*++p) & 0x3F;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
// takes a pointer into a string and finds its offset
|
||||||
|
static ustring::size_type utf8_ptr_to_offset(const char *str, const char *pos)
|
||||||
{
|
{
|
||||||
ustring::size_type offset = 0;
|
ustring::size_type offset = 0;
|
||||||
|
|
||||||
while (str < pos) {
|
while (str < pos) {
|
||||||
str += g_utf8_skip[*str];
|
str += utf8_skip[*str];
|
||||||
offset += g_utf8_skip[*str];
|
offset++;
|
||||||
}
|
}
|
||||||
|
|
||||||
return offset;
|
return offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// takes an offset into a string and returns a pointer to it
|
||||||
|
const char *utf8_offset_to_ptr(const char *str, ustring::size_type offset)
|
||||||
|
{
|
||||||
|
while (offset--)
|
||||||
|
str += utf8_skip[*str];
|
||||||
|
return str;
|
||||||
|
}
|
||||||
|
|
||||||
// First overload: stop on '\0' character.
|
// First overload: stop on '\0' character.
|
||||||
ustring::size_type utf8_byte_offset(const char* str, ustring::size_type offset)
|
ustring::size_type utf8_byte_offset(const char* str, ustring::size_type offset)
|
||||||
{
|
{
|
||||||
|
@ -39,7 +69,7 @@ ustring::size_type utf8_byte_offset(const char* str, ustring::size_type offset)
|
||||||
if(*p == '\0')
|
if(*p == '\0')
|
||||||
return ustring::npos;
|
return ustring::npos;
|
||||||
|
|
||||||
p += g_utf8_skip[*p];
|
p += utf8_skip[*p];
|
||||||
}
|
}
|
||||||
|
|
||||||
return (p - str);
|
return (p - str);
|
||||||
|
@ -60,7 +90,7 @@ ustring::size_type utf8_byte_offset(const char* str, ustring::size_type offset,
|
||||||
if(p >= pend)
|
if(p >= pend)
|
||||||
return ustring::npos;
|
return ustring::npos;
|
||||||
|
|
||||||
p += g_utf8_skip[*p];
|
p += utf8_skip[*p];
|
||||||
}
|
}
|
||||||
|
|
||||||
return (p - str);
|
return (p - str);
|
||||||
|
@ -122,7 +152,7 @@ ustring::size_type ustring::size() const
|
||||||
{
|
{
|
||||||
if (_utf8) {
|
if (_utf8) {
|
||||||
const char *const pdata = _string.data();
|
const char *const pdata = _string.data();
|
||||||
return utf8_find_offset(pdata, pdata + _string.size());
|
return utf8_ptr_to_offset(pdata, pdata + _string.size());
|
||||||
} else
|
} else
|
||||||
return _string.size();
|
return _string.size();
|
||||||
}
|
}
|
||||||
|
@ -181,6 +211,11 @@ void ustring::resize(ustring::size_type n, char c)
|
||||||
_string.resize(n, c);
|
_string.resize(n, c);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the character at character index i (not a reference; the string
// cannot be written through this operator). No range checking is done.
ustring::value_type ustring::operator[](ustring::size_type i) const
{
  // When the string is not UTF-8, size() counts bytes, so the index has to
  // address bytes too; decoding here would mis-step over any byte >= 0x80.
  if (!_utf8)
    return static_cast<unsigned char>(_string[i]);

  return utf8_get_char(utf8_offset_to_ptr(_string.data(), i));
}
|
||||||
|
|
||||||
const char* ustring::data() const
|
const char* ustring::data() const
|
||||||
{
|
{
|
||||||
return _string.data();
|
return _string.data();
|
||||||
|
|
|
@ -7,7 +7,7 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
extern "C" {
|
extern "C" {
|
||||||
/*
|
|
||||||
#ifdef HAVE_STDINT_H
|
#ifdef HAVE_STDINT_H
|
||||||
# include <stdint.h>
|
# include <stdint.h>
|
||||||
#else
|
#else
|
||||||
|
@ -15,25 +15,25 @@ extern "C" {
|
||||||
# include <sys/types.h>
|
# include <sys/types.h>
|
||||||
# endif
|
# endif
|
||||||
#endif
|
#endif
|
||||||
*/
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#include <string>
|
#include <string>
|
||||||
|
|
||||||
namespace otk {
|
namespace otk {
|
||||||
|
|
||||||
/*
|
|
||||||
#ifdef HAVE_STDINT_H
|
#ifdef HAVE_STDINT_H
|
||||||
typedef uint32_t unichar;
|
typedef uint32_t unichar;
|
||||||
#else
|
#else
|
||||||
typedef u_int32_t unichar;
|
typedef u_int32_t unichar;
|
||||||
#endif
|
#endif
|
||||||
*/
|
|
||||||
|
|
||||||
#ifndef DOXYGEN_IGNORE
|
#ifndef DOXYGEN_IGNORE
|
||||||
|
|
||||||
//! The number of bytes to skip to find the next character in the string
|
//! The number of bytes to skip to find the next character in the string
|
||||||
const char g_utf8_skip[256] = {
|
const char utf8_skip[256] = {
|
||||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
|
||||||
|
@ -44,6 +44,8 @@ const char g_utf8_skip[256] = {
|
||||||
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
|
3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,6,6,1,1
|
||||||
};
|
};
|
||||||
|
|
||||||
|
unichar utf8_get_char(const char *p);
|
||||||
|
|
||||||
#endif // DOXYGEN_IGNORE
|
#endif // DOXYGEN_IGNORE
|
||||||
|
|
||||||
//! The iterator type for ustring
|
//! The iterator type for ustring
|
||||||
|
@ -57,13 +59,13 @@ const char g_utf8_skip[256] = {
|
||||||
write operation would invalidate all other iterators pointing into the same
|
write operation would invalidate all other iterators pointing into the same
|
||||||
string.
|
string.
|
||||||
*/
|
*/
|
||||||
/*
|
|
||||||
template <class T>
|
template <class T>
|
||||||
class ustring_Iterator
|
class ustring_Iterator
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
typedef std::bidirectional_iterator_tag iterator_category;
|
typedef std::bidirectional_iterator_tag iterator_category;
|
||||||
//typedef unichar value_type;
|
typedef unichar value_type;
|
||||||
typedef std::string::difference_type difference_type;
|
typedef std::string::difference_type difference_type;
|
||||||
//typedef value_type reference;
|
//typedef value_type reference;
|
||||||
typedef void pointer;
|
typedef void pointer;
|
||||||
|
@ -74,26 +76,9 @@ public:
|
||||||
|
|
||||||
|
|
||||||
inline value_type operator*() const {
  // view the position as an iterator into the internal (const) string
  const std::string::const_iterator cursor = _pos;
  // decode the unicode character whose first byte the iterator points at
  const char *const first_byte = &(*cursor);
  return utf8_get_char(first_byte);
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -112,7 +97,7 @@ public:
|
||||||
private:
|
private:
|
||||||
T _pos;
|
T _pos;
|
||||||
};
|
};
|
||||||
*/
|
|
||||||
|
|
||||||
//! This class provides a simple wrapper to a std::string that can be encoded
|
//! This class provides a simple wrapper to a std::string that can be encoded
|
||||||
//! as UTF-8. The ustring::utf() member specifies if the given string is UTF-8
|
//! as UTF-8. The ustring::utf() member specifies if the given string is UTF-8
|
||||||
|
@ -135,7 +120,7 @@ public:
|
||||||
typedef std::string::size_type size_type;
|
typedef std::string::size_type size_type;
|
||||||
typedef std::string::difference_type difference_type;
|
typedef std::string::difference_type difference_type;
|
||||||
|
|
||||||
//typedef unichar value_type;
|
typedef unichar value_type;
|
||||||
//typedef unichar & reference;
|
//typedef unichar & reference;
|
||||||
//typedef const unichar & const_reference;
|
//typedef const unichar & const_reference;
|
||||||
|
|
||||||
|
@ -177,6 +162,11 @@ public:
|
||||||
|
|
||||||
void resize(size_type n, char c='\0');
|
void resize(size_type n, char c='\0');
|
||||||
|
|
||||||
|
// extract characters
|
||||||
|
|
||||||
|
// No reference return; use replace() to write characters.
|
||||||
|
value_type operator[](size_type i) const;
|
||||||
|
|
||||||
// internal data
|
// internal data
|
||||||
|
|
||||||
const char* data() const;
|
const char* data() const;
|
||||||
|
|
Loading…
Reference in a new issue