Qore Programming Language  0.8.9
 All Classes Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
QoreEncoding.h
Go to the documentation of this file.
1 /* -*- mode: c++; indent-tabs-mode: nil -*- */
2 /*
3  QoreEncoding.h
4 
5  Qore Programming Language
6 
7  Copyright 2003 - 2013 David Nichols
8 
9  This library is free software; you can redistribute it and/or
10  modify it under the terms of the GNU Lesser General Public
11  License as published by the Free Software Foundation; either
12  version 2.1 of the License, or (at your option) any later version.
13 
14  This library is distributed in the hope that it will be useful,
15  but WITHOUT ANY WARRANTY; without even the implied warranty of
16  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17  Lesser General Public License for more details.
18 
19  You should have received a copy of the GNU Lesser General Public
20  License along with this library; if not, write to the Free Software
21  Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
22 */
23 
24 #ifndef _QORE_CHARSET_H
25 
26 #define _QORE_CHARSET_H
27 
33 #include <qore/common.h>
34 #include <qore/QoreThreadLock.h>
35 
36 #include <strings.h>
37 #include <string.h>
38 
39 #include <map>
40 
41 #include <string>
42 
44 typedef qore_size_t (*mbcs_length_t)(const char* str, const char* end, bool &invalid);
45 
47 typedef qore_size_t (*mbcs_end_t)(const char* str, const char* end, qore_size_t num_chars, bool &invalid);
48 
50 typedef qore_size_t (*mbcs_pos_t)(const char* str, const char* ptr, bool &invalid);
51 
53 
57 typedef qore_size_t (*mbcs_charlen_t)(const char* str, qore_size_t valid_len);
58 
59 class ExceptionSink;
60 
62 
72 class QoreEncoding {
73 private:
74  std::string code;
75  std::string desc;
76  mbcs_length_t flength;
77  mbcs_end_t fend;
78  mbcs_pos_t fpos;
79  mbcs_charlen_t fcharlen;
80  unsigned char maxwidth;
81 
82 public:
83  DLLLOCAL QoreEncoding(const char* n_code, const char* n_desc = 0, unsigned char n_maxwidth = 1, mbcs_length_t l = 0, mbcs_end_t e = 0, mbcs_pos_t p = 0, mbcs_charlen_t c = 0) : code(n_code), desc(n_desc ? n_desc : ""), flength(l), fend(e), fpos(p), fcharlen(c), maxwidth(n_maxwidth) {
84  }
85 
86  DLLLOCAL ~QoreEncoding() {
87  }
88 
90 
95  DLLLOCAL qore_size_t getLength(const char* p, const char* end, bool &invalid) const {
96  return flength ? flength(p, end, invalid) : strlen(p);
97  }
98 
100 
105  DLLLOCAL qore_size_t getLength(const char* p, const char* end, ExceptionSink* xsink) const;
106 
108 
114  DLLLOCAL qore_size_t getByteLen(const char* p, const char* end, qore_size_t c, bool& invalid) const {
115  return fend ? fend(p, end, c, invalid) : c;
116  }
117 
119 
125  DLLLOCAL qore_size_t getByteLen(const char* p, const char* end, qore_size_t c, ExceptionSink* xsink) const;
126 
128 
133  DLLLOCAL qore_size_t getCharPos(const char* p, const char* end, bool& invalid) const {
134  return fpos ? fpos(p, end, invalid) : end - p;
135  }
136 
138 
143  DLLLOCAL qore_size_t getCharPos(const char* p, const char* end, ExceptionSink* xsink) const;
144 
146 
151  DLLLOCAL qore_size_t getCharLen(const char* p, qore_size_t valid_len) const {
152  return fcharlen ? fcharlen(p, valid_len) : 1;
153  }
154 
156  DLLLOCAL bool isMultiByte() const {
157  return (bool)flength;
158  }
159 
161  DLLLOCAL const char* getCode() const {
162  return code.c_str();
163  }
164 
166  DLLLOCAL const char* getDesc() const {
167  return desc.empty() ? "<no description available>" : desc.c_str();
168  }
169 
171  DLLLOCAL int getMaxCharWidth() const {
172  return maxwidth;
173  }
174 };
175 
176 // case-insensitive maps for encodings
177 typedef std::map<const char*, QoreEncoding*, class ltcstrcase> encoding_map_t;
178 typedef std::map<const char*, const QoreEncoding*, class ltcstrcase> const_encoding_map_t;
179 
180 class QoreString;
181 
183 
186 private:
187  DLLLOCAL static encoding_map_t emap;
188  DLLLOCAL static const_encoding_map_t amap;
189  DLLLOCAL static class QoreThreadLock mutex;
190 
191  DLLLOCAL static const QoreEncoding* addUnlocked(const char* code, const char* desc, unsigned char maxwidth = 1, mbcs_length_t l = 0, mbcs_end_t e = 0, mbcs_pos_t p = 0, mbcs_charlen_t = 0);
192  DLLLOCAL static const QoreEncoding* findUnlocked(const char* name);
193 
194 public:
196  DLLEXPORT static void addAlias(const QoreEncoding* qcs, const char* alias);
197 
199  DLLEXPORT static const QoreEncoding* findCreate(const char* name);
200 
202  DLLEXPORT static const QoreEncoding* findCreate(const QoreString* str);
203 
205  DLLEXPORT static void showEncodings();
206 
208  DLLEXPORT static void showAliases();
209 
211  DLLEXPORT static const QoreEncoding* add(const char* code, const char* desc = 0, unsigned char maxwidth = 1, mbcs_length_t l = 0, mbcs_end_t e = 0, mbcs_pos_t p = 0, mbcs_charlen_t = 0);
212 
213  DLLLOCAL static void init(const char* def);
214  DLLLOCAL QoreEncodingManager();
215  DLLLOCAL ~QoreEncodingManager();
216 };
217 
218 DLLEXPORT qore_size_t q_get_byte_len(const QoreEncoding* enc, const char* p, const char* end, qore_size_t c, ExceptionSink* xsink);
219 DLLEXPORT qore_size_t q_get_char_len(const QoreEncoding* enc, const char* p, qore_size_t valid_len, ExceptionSink* xsink);
220 
222 DLLEXPORT extern QoreEncodingManager QEM;
223 
224 // builtin character encodings
225 DLLEXPORT extern const QoreEncoding* QCS_DEFAULT,
226  *QCS_USASCII,
227  *QCS_UTF8,
228  *QCS_ISO_8859_1,
229  *QCS_ISO_8859_2,
230  *QCS_ISO_8859_3,
231  *QCS_ISO_8859_4,
232  *QCS_ISO_8859_5,
233  *QCS_ISO_8859_6,
234  *QCS_ISO_8859_7,
235  *QCS_ISO_8859_8,
236  *QCS_ISO_8859_9,
237  *QCS_ISO_8859_10,
238  *QCS_ISO_8859_11,
239  *QCS_ISO_8859_13,
240  *QCS_ISO_8859_14,
241  *QCS_ISO_8859_15,
242  *QCS_ISO_8859_16,
243  *QCS_KOI8_R,
244  *QCS_KOI8_U,
245  *QCS_KOI7;
246 
248 DLLEXPORT qore_size_t q_UTF8_get_char_len(const char* p, qore_size_t valid_len);
249 
250 #endif // _QORE_CHARSET_H
DLLLOCAL const char * getDesc() const
returns the description for the encoding
Definition: QoreEncoding.h:166
DLLEXPORT const QoreEncoding * QCS_UTF8
UTF-8 multi-byte encoding (the only multi-byte encoding, all others are single-byte encodings) ...
DLLLOCAL int getMaxCharWidth() const
returns the maximum character width in bytes for the encoding
Definition: QoreEncoding.h:171
defines string encoding functions in Qore
Definition: QoreEncoding.h:72
DLLEXPORT const QoreEncoding * QCS_ISO_8859_8
Hebrew character set.
DLLEXPORT const QoreEncoding * QCS_ISO_8859_1
latin-1, Western European encoding
DLLEXPORT const QoreEncoding * QCS_DEFAULT
the default encoding for the Qore library
DLLLOCAL qore_size_t getLength(const char *p, const char *end, bool &invalid) const
gives the length of the string in characters
Definition: QoreEncoding.h:95
static DLLEXPORT void addAlias(const QoreEncoding *qcs, const char *alias)
adds an alias for an encoding
DLLEXPORT const QoreEncoding * QCS_ISO_8859_2
latin-2, Central European encoding
DLLEXPORT const QoreEncoding * QCS_ISO_8859_11
Thai character set.
manages encodings in Qore
Definition: QoreEncoding.h:185
DLLEXPORT const QoreEncoding * QCS_ISO_8859_3
latin-3, Southern European character set
size_t qore_size_t
used for sizes (same range as a pointer)
Definition: common.h:62
qore_size_t(* mbcs_charlen_t)(const char *str, qore_size_t valid_len)
for multi-byte encodings: gives the number of total bytes for the character given one or more charact...
Definition: QoreEncoding.h:57
DLLEXPORT const QoreEncoding * QCS_ISO_8859_4
latin-4, Northern European character set
static DLLEXPORT void showEncodings()
prints out all valid encodings to stdout
DLLEXPORT const QoreEncoding * QCS_USASCII
ascii encoding
DLLLOCAL const char * getCode() const
returns the string code (ex: "UTF-8") for the encoding
Definition: QoreEncoding.h:161
DLLLOCAL qore_size_t getByteLen(const char *p, const char *end, qore_size_t c, bool &invalid) const
gives the number of bytes for the number of chars in the string or up to the end of the string ...
Definition: QoreEncoding.h:114
DLLEXPORT const QoreEncoding * QCS_ISO_8859_10
latin-6, Nordic character set
Qore's string type supported by the QoreEncoding class.
Definition: QoreString.h:42
DLLEXPORT const QoreEncoding * QCS_KOI8_U
Ukrainian: Kod Obmena Informatsiey, 8 bit.
qore_size_t(* mbcs_length_t)(const char *str, const char *end, bool &invalid)
for multi-byte character set encodings: gives the length of the string in characters ...
Definition: QoreEncoding.h:44
DLLEXPORT QoreEncodingManager QEM
the QoreEncodingManager object
qore_size_t(* mbcs_end_t)(const char *str, const char *end, qore_size_t num_chars, bool &invalid)
for multi-byte character set encodings: gives the number of bytes for the number of chars ...
Definition: QoreEncoding.h:47
DLLEXPORT const QoreEncoding * QCS_ISO_8859_9
latin-5, Turkish character set
static DLLEXPORT void showAliases()
prints out all aliases to stdout
static DLLEXPORT const QoreEncoding * findCreate(const char *name)
finds an encoding if it exists (also looks up against alias names) and creates a new one if it doesn't...
DLLEXPORT const QoreEncoding * QCS_KOI7
Russian: Kod Obmena Informatsiey, 7 bit characters.
qore_size_t(* mbcs_pos_t)(const char *str, const char *ptr, bool &invalid)
for multi-byte character set encodings: gives the character position of the ptr
Definition: QoreEncoding.h:50
container for holding Qore-language exception information and also for registering a "thread_exit" ca...
Definition: ExceptionSink.h:35
DLLEXPORT const QoreEncoding * QCS_ISO_8859_14
latin-8, Celtic character set
DLLLOCAL qore_size_t getCharPos(const char *p, const char *end, bool &invalid) const
gives the character position (number of characters) starting from the first pointer to the second ...
Definition: QoreEncoding.h:133
DLLEXPORT const QoreEncoding * QCS_ISO_8859_6
Arabic character set.
DLLLOCAL bool isMultiByte() const
returns true if the encoding is a multi-byte encoding
Definition: QoreEncoding.h:156
DLLEXPORT qore_size_t q_UTF8_get_char_len(const char *p, qore_size_t valid_len)
returns the length of the next UTF-8 character or 0 for an encoding error or a negative number if the...
DLLEXPORT const QoreEncoding * QCS_ISO_8859_5
Cyrillic character set.
provides a mutually-exclusive thread lock
Definition: QoreThreadLock.h:41
DLLLOCAL qore_size_t getCharLen(const char *p, qore_size_t valid_len) const
gives the number of total bytes for the character given one or more characters
Definition: QoreEncoding.h:151
static DLLEXPORT const QoreEncoding * add(const char *code, const char *desc=0, unsigned char maxwidth=1, mbcs_length_t l=0, mbcs_end_t e=0, mbcs_pos_t p=0, mbcs_charlen_t=0)
adds a new encoding to the list
DLLEXPORT const QoreEncoding * QCS_ISO_8859_16
latin-10, Southeast European character set
DLLEXPORT const QoreEncoding * QCS_ISO_8859_15
latin-9, Western European with euro symbol
DLLEXPORT const QoreEncoding * QCS_KOI8_R
Russian: Kod Obmena Informatsiey, 8 bit.
DLLEXPORT const QoreEncoding * QCS_ISO_8859_7
Greek character set.
DLLEXPORT const QoreEncoding * QCS_ISO_8859_13
latin-7, Baltic rim character set