Qore Programming Language - C/C++ Library  0.8.13.4
QoreEncoding.h
Go to the documentation of this file.
1 /* -*- mode: c++; indent-tabs-mode: nil -*- */
2 /*
3  QoreEncoding.h
4 
5  Qore Programming Language
6 
7  Copyright (C) 2003 - 2017 Qore Technologies, s.r.o.
8 
9  Permission is hereby granted, free of charge, to any person obtaining a
10  copy of this software and associated documentation files (the "Software"),
11  to deal in the Software without restriction, including without limitation
12  the rights to use, copy, modify, merge, publish, distribute, sublicense,
13  and/or sell copies of the Software, and to permit persons to whom the
14  Software is furnished to do so, subject to the following conditions:
15 
16  The above copyright notice and this permission notice shall be included in
17  all copies or substantial portions of the Software.
18 
19  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20  IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21  FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22  AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23  LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
24  FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
25  DEALINGS IN THE SOFTWARE.
26 
27  Note that the Qore library is released under a choice of three open-source
28  licenses: MIT (as above), LGPL 2+, or GPL 2+; see README-LICENSE for more
29  information.
30 */
31 
32 #ifndef _QORE_ENCODING_H
33 
34 #define _QORE_ENCODING_H
35 
41 #include <qore/common.h>
42 #include <qore/QoreThreadLock.h>
43 
44 #include <strings.h>
45 #include <string.h>
46 
47 #include <map>
48 
49 #include <string>
50 
//! for multi-byte character set encodings: gives the length of the string in characters
52 typedef qore_size_t (*mbcs_length_t)(const char* str, const char* end, bool &invalid);
53 
//! for multi-byte character set encodings: gives the number of bytes for the given number of characters (num_chars)
55 typedef qore_size_t (*mbcs_end_t)(const char* str, const char* end, qore_size_t num_chars, bool &invalid);
56 
//! for multi-byte character set encodings: gives the character position of the ptr
58 typedef qore_size_t (*mbcs_pos_t)(const char* str, const char* ptr, bool &invalid);
59 
61 
//! for multi-byte encodings: gives the number of total bytes for the character given one or more characters
65 typedef qore_offset_t (*mbcs_charlen_t)(const char* str, qore_size_t valid_len);
66 
//! returns the unicode code point for the given character, assumes there is enough data for the character
68 typedef unsigned (*mbcs_get_unicode_t)(const char* p);
69 
70 // private implementation of the QoreEncoding class
71 struct qore_encoding_private;
72 
74 
//! defines string encoding functions in Qore
/** The data members below are part of the library's ABI (see the NOTE in the
    protected section), so their order and types must not be changed.
 */
85 class QoreEncoding {
86  friend struct qore_encoding_private;
87 
88 protected:
89  // FIXME: move all this to the private implementation with the ABI change
90  // NOTE: the following class members cannot be removed because until Qore 0.8.12 this class implemented inline member functions
91  // that referred directly to these member variables, therefore they make up a part of the library's ABI :(
    // NOTE(review): presumably the string code reported by getCode() - confirm in the implementation
92  std::string code;
    // NOTE(review): presumably the description reported by getDesc() - confirm in the implementation
93  std::string desc;
    // per-encoding callbacks; the types match the l, e, p and c constructor arguments
94  mbcs_length_t flength;
95  mbcs_end_t fend;
96  mbcs_pos_t fpos;
97  mbcs_charlen_t fcharlen;
    // NOTE(review): presumably the maximum character width in bytes (cf. getMaxCharWidth()) - confirm
98  unsigned char maxwidth;
99 
    // pointer to the private implementation (struct qore_encoding_private, declared above)
100  qore_encoding_private* priv;
101 
102 public:
    //! constructor; every argument except n_code has a default value
    /** Marked DLLLOCAL, unlike the DLLEXPORT member functions below.
        l, e, p, c and gu are the optional per-encoding callbacks (see the mbcs_*_t typedefs).
     */
103  DLLLOCAL QoreEncoding(const char* n_code, const char* n_desc = 0, unsigned char n_minwidth = 1, unsigned char n_maxwidth = 1, mbcs_length_t l = 0, mbcs_end_t e = 0, mbcs_pos_t p = 0, mbcs_charlen_t c = 0, mbcs_get_unicode_t gu = 0, bool n_ascii_compat = true);
104 
    //! destructor (DLLLOCAL)
105  DLLLOCAL ~QoreEncoding();
106 
    //! gives the length of the string in characters
108 
113  DLLEXPORT qore_size_t getLength(const char* p, const char* end, bool& invalid) const;
114 
    //! gives the length of the string in characters
    /** Overload taking an ExceptionSink instead of an "invalid" flag.
        NOTE(review): presumably raises a Qore-language exception on invalid data - confirm.
     */
116 
121  DLLEXPORT qore_size_t getLength(const char* p, const char* end, ExceptionSink* xsink) const;
122 
    //! gives the number of bytes for the number of chars in the string or up to the end of the string
124 
130  DLLEXPORT qore_size_t getByteLen(const char* p, const char* end, qore_size_t c, bool& invalid) const;
131 
    //! gives the number of bytes for the number of chars in the string or up to the end of the string
    /** Overload taking an ExceptionSink instead of an "invalid" flag.
        NOTE(review): presumably raises a Qore-language exception on invalid data - confirm.
     */
133 
139  DLLEXPORT qore_size_t getByteLen(const char* p, const char* end, qore_size_t c, ExceptionSink* xsink) const;
140 
    //! gives the character position (number of characters) starting from the first pointer to the second
142 
147  DLLEXPORT qore_size_t getCharPos(const char* p, const char* end, bool& invalid) const;
148 
    //! gives the character position (number of characters) starting from the first pointer to the second
    /** Overload taking an ExceptionSink instead of an "invalid" flag.
        NOTE(review): presumably raises a Qore-language exception on invalid data - confirm.
     */
150 
155  DLLEXPORT qore_size_t getCharPos(const char* p, const char* end, ExceptionSink* xsink) const;
156 
    //! gives the number of total bytes for the character given one or more characters
    /** NOTE(review): the return type is the signed qore_offset_t; the meaning of a
        negative return value is not visible in this header - confirm.
     */
158 
164  DLLEXPORT qore_offset_t getCharLen(const char* p, qore_size_t valid_len) const;
165 
    //! returns true if the encoding is a multi-byte encoding
167  DLLEXPORT bool isMultiByte() const;
168 
    //! returns the string code (ex: "UTF-8") for the encoding
170  DLLEXPORT const char* getCode() const;
171 
    //! returns the description for the encoding
173  DLLEXPORT const char* getDesc() const;
174 
    //! returns the maximum character width in bytes for the encoding
    /** note: returns int, whereas getMinCharWidth() returns unsigned
     */
176  DLLEXPORT int getMaxCharWidth() const;
177 
179 
    //! returns the minimum character width in bytes for the encoding
181  DLLEXPORT unsigned getMinCharWidth() const;
182 
184 
    //! returns true if the character encoding is backwards-compatible with ASCII
186  DLLEXPORT bool isAsciiCompat() const;
187 
    //! returns the unicode code point for the given character
    /** NOTE(review): errors (e.g. an invalid character) are presumably reported
        through xsink, and clen is presumably set to the character's length - confirm.
     */
189 
198  DLLEXPORT int getUnicode(const char* p, const char* end, unsigned& clen, ExceptionSink* xsink) const;
199 };
200 
201 // case-insensitive maps for encodings
// map keyed by C string and ordered with the ltcstrcase comparator; values are mutable QoreEncoding pointers
202 typedef std::map<const char*, QoreEncoding*, ltcstrcase> encoding_map_t;
// same key and comparator as encoding_map_t, but the values are pointers to const QoreEncoding
203 typedef std::map<const char*, const QoreEncoding*, ltcstrcase> const_encoding_map_t;
204 
// forward declaration: only QoreString pointers are used in this header
205 class QoreString;
206 
208 
//! manages encodings in Qore
/** The class header line (line 210 of the original header) is restored here;
    without it the access specifiers and the closing brace have no enclosing class.
 */
210 class QoreEncodingManager {
211 private:
    // NOTE(review): presumably emap holds the registered encodings and amap the aliases - confirm in the implementation
212  DLLLOCAL static encoding_map_t emap;
213  DLLLOCAL static const_encoding_map_t amap;
    // NOTE(review): presumably guards emap and amap; the *Unlocked() helpers below would then expect it to be held - confirm
214  DLLLOCAL static QoreThreadLock mutex;
215 
    // the arguments mirror the QoreEncoding constructor
216  DLLLOCAL static const QoreEncoding* addUnlocked(const char* n_code, const char* n_desc = 0, unsigned char n_minwidth = 1, unsigned char n_maxwidth = 1, mbcs_length_t l = 0, mbcs_end_t e = 0, mbcs_pos_t p = 0, mbcs_charlen_t c = 0, mbcs_get_unicode_t gu = 0, bool n_ascii_compat = true);
217  DLLLOCAL static const QoreEncoding* findUnlocked(const char* name);
218 
219 public:
    //! registers "alias" as an alias for the encoding "qcs"
221  DLLEXPORT static void addAlias(const QoreEncoding* qcs, const char* alias);
222 
    //! looks up an encoding by name
    /** NOTE(review): the name suggests the encoding is created when it is not found - confirm.
     */
224  DLLEXPORT static const QoreEncoding* findCreate(const char* name);
225 
    //! looks up an encoding by name, with the name given as a QoreString
    /** NOTE(review): the name suggests the encoding is created when it is not found - confirm.
     */
227  DLLEXPORT static const QoreEncoding* findCreate(const QoreString* str);
228 
    //! NOTE(review): presumably prints the known encodings - confirm the output destination
230  DLLEXPORT static void showEncodings();
231 
    //! NOTE(review): presumably prints the known aliases - confirm the output destination
233  DLLEXPORT static void showAliases();
234 
    //! adds a new encoding; the callbacks l, e, p and c are optional
    /** Unlike addUnlocked(), this exported overload has no minimum-width,
        get-unicode or ASCII-compatibility arguments.
     */
236  DLLEXPORT static const QoreEncoding* add(const char* code, const char* desc = 0, unsigned char maxwidth = 1, mbcs_length_t l = 0, mbcs_end_t e = 0, mbcs_pos_t p = 0, mbcs_charlen_t c = 0);
237 
    // library-internal (DLLLOCAL) initialization and lifetime management
    // NOTE(review): "def" is presumably the name of the default encoding - confirm
238  DLLLOCAL static void init(const char* def);
239  DLLLOCAL QoreEncodingManager();
240  DLLLOCAL ~QoreEncodingManager();
241 };
242 
// free functions taking the encoding as their first argument
// NOTE(review): presumably counterparts of QoreEncoding::getByteLen() and QoreEncoding::getCharLen() that report errors through xsink - confirm
243 DLLEXPORT qore_size_t q_get_byte_len(const QoreEncoding* enc, const char* p, const char* end, qore_size_t c, ExceptionSink* xsink);
244 DLLEXPORT qore_offset_t q_get_char_len(const QoreEncoding* enc, const char* p, qore_size_t valid_len, ExceptionSink* xsink);
245 
//! the QoreEncodingManager object
247 DLLEXPORT extern QoreEncodingManager QEM;
248 
249 // builtin character encodings
250 DLLEXPORT extern const QoreEncoding* QCS_DEFAULT, //!< the default encoding for the Qore library
251  *QCS_USASCII, //!< ascii encoding
252  *QCS_UTF8, //!< UTF-8 multi-byte encoding
253  *QCS_UTF16, //!< UTF-16 (multi-byte encoding)
254  *QCS_UTF16BE, //!< UTF-16BE (multi-byte encoding)
255  *QCS_UTF16LE, //!< UTF-16LE (multi-byte encoding)
256  *QCS_ISO_8859_1, //!< latin-1, Western European encoding
257  *QCS_ISO_8859_2, //!< latin-2, Central European encoding
258  *QCS_ISO_8859_3, //!< latin-3, Southern European character set
259  *QCS_ISO_8859_4, //!< latin-4, Northern European character set
260  *QCS_ISO_8859_5, //!< Cyrillic character set
261  *QCS_ISO_8859_6, //!< Arabic character set
262  *QCS_ISO_8859_7, //!< Greek character set
263  *QCS_ISO_8859_8, //!< Hebrew character set
264  *QCS_ISO_8859_9, //!< latin-5, Turkish character set
265  *QCS_ISO_8859_10, //!< latin-6, Nordic character set
266  *QCS_ISO_8859_11, //!< Thai character set
267  *QCS_ISO_8859_13, //!< latin-7, Baltic rim character set
268  *QCS_ISO_8859_14, //!< latin-8, Celtic character set
269  *QCS_ISO_8859_15, //!< latin-9, Western European with euro symbol
270  *QCS_ISO_8859_16, //!< latin-10, Southeast European character set
271  *QCS_KOI8_R, //!< Russian: Kod Obmena Informatsiey, 8 bit
272  *QCS_KOI8_U, //!< Ukrainian: Kod Obmena Informatsiey, 8 bit
273  *QCS_KOI7; //!< Russian: Kod Obmena Informatsiey, 7 bit characters
274 
275 #endif // _QORE_ENCODING_H
qore_size_t(* mbcs_end_t)(const char *str, const char *end, qore_size_t num_chars, bool &invalid)
for multi-byte character set encodings: gives the number of bytes for the number of chars ...
Definition: QoreEncoding.h:55
DLLEXPORT const QoreEncoding * QCS_UTF8
UTF-8 multi-byte encoding (only UTF-8 and UTF-16 are multi-byte encodings)
DLLEXPORT qore_size_t getCharPos(const char *p, const char *end, bool &invalid) const
gives the character position (number of characters) starting from the first pointer to the second ...
defines string encoding functions in Qore
Definition: QoreEncoding.h:85
DLLEXPORT bool isMultiByte() const
returns true if the encoding is a multi-byte encoding
DLLEXPORT const QoreEncoding * QCS_ISO_8859_8
Hebrew character set.
DLLEXPORT const QoreEncoding * QCS_ISO_8859_1
latin-1, Western European encoding
DLLEXPORT bool isAsciiCompat() const
returns true if the character encoding is backwards-compatible with ASCII
DLLEXPORT const QoreEncoding * QCS_UTF16
UTF-16 (only UTF-8 and UTF-16* are multi-byte encodings)
DLLEXPORT const QoreEncoding * QCS_DEFAULT
the default encoding for the Qore library
DLLEXPORT const QoreEncoding * QCS_ISO_8859_2
latin-2, Central European encoding
DLLEXPORT const QoreEncoding * QCS_ISO_8859_11
Thai character set.
manages encodings in Qore
Definition: QoreEncoding.h:210
DLLEXPORT const QoreEncoding * QCS_ISO_8859_3
latin-3, Southern European character set
size_t qore_size_t
used for sizes (same range as a pointer)
Definition: common.h:74
DLLEXPORT qore_offset_t getCharLen(const char *p, qore_size_t valid_len) const
gives the number of total bytes for the character given one or more characters
DLLEXPORT const QoreEncoding * QCS_ISO_8859_4
latin-4, Northern European character set
DLLEXPORT const QoreEncoding * QCS_USASCII
ascii encoding
qore_size_t(* mbcs_pos_t)(const char *str, const char *ptr, bool &invalid)
for multi-byte character set encodings: gives the character position of the ptr
Definition: QoreEncoding.h:58
unsigned(* mbcs_get_unicode_t)(const char *p)
returns the unicode code point for the given character, assumes there is enough data for the characte...
Definition: QoreEncoding.h:68
DLLEXPORT const QoreEncoding * QCS_ISO_8859_10
latin-6, Nordic character set
Qore's string type supported by the QoreEncoding class.
Definition: QoreString.h:82
DLLEXPORT const char * getDesc() const
returns the description for the encoding
DLLEXPORT const QoreEncoding * QCS_KOI8_U
Ukrainian: Kod Obmena Informatsiey, 8 bit.
DLLEXPORT QoreEncodingManager QEM
the QoreEncodingManager object
DLLEXPORT qore_size_t getLength(const char *p, const char *end, bool &invalid) const
gives the length of the string in characters
DLLEXPORT unsigned getMinCharWidth() const
returns the minimum character width in bytes for the encoding
qore_offset_t(* mbcs_charlen_t)(const char *str, qore_size_t valid_len)
for multi-byte encodings: gives the number of total bytes for the character given one or more charact...
Definition: QoreEncoding.h:65
DLLEXPORT const QoreEncoding * QCS_ISO_8859_9
latin-5, Turkish character set
DLLEXPORT const char * getCode() const
returns the string code (ex: "UTF-8") for the encoding
DLLEXPORT int getMaxCharWidth() const
returns the maximum character width in bytes for the encoding
DLLEXPORT const QoreEncoding * QCS_UTF16BE
UTF-16BE (only UTF-8 and UTF-16* are multi-byte encodings)
DLLEXPORT const QoreEncoding * QCS_KOI7
Russian: Kod Obmena Informatsiey, 7 bit characters.
qore_size_t(* mbcs_length_t)(const char *str, const char *end, bool &invalid)
for multi-byte character set encodings: gives the length of the string in characters ...
Definition: QoreEncoding.h:52
DLLEXPORT qore_size_t getByteLen(const char *p, const char *end, qore_size_t c, bool &invalid) const
gives the number of bytes for the number of chars in the string or up to the end of the string ...
container for holding Qore-language exception information and also for registering a "thread_exit" ca...
Definition: ExceptionSink.h:47
DLLEXPORT const QoreEncoding * QCS_ISO_8859_14
latin-8, Celtic character set
DLLEXPORT int getUnicode(const char *p, const char *end, unsigned &clen, ExceptionSink *xsink) const
returns the unicode code point for the given character; if there are any errors (invalid character...
DLLEXPORT const QoreEncoding * QCS_ISO_8859_6
Arabic character set.
intptr_t qore_offset_t
used for offsets that could be negative
Definition: common.h:77
DLLEXPORT const QoreEncoding * QCS_ISO_8859_5
Cyrillic character set.
provides a mutually-exclusive thread lock
Definition: QoreThreadLock.h:49
DLLEXPORT const QoreEncoding * QCS_UTF16LE
UTF-16LE (only UTF-8 and UTF-16* are multi-byte encodings)
DLLEXPORT const QoreEncoding * QCS_ISO_8859_16
latin-10, Southeast European character set
DLLEXPORT const QoreEncoding * QCS_ISO_8859_15
latin-9, Western European with euro symbol
DLLEXPORT const QoreEncoding * QCS_KOI8_R
Russian: Kod Obmena Informatsiey, 8 bit.
DLLEXPORT const QoreEncoding * QCS_ISO_8859_7
Greek character set.
DLLEXPORT const QoreEncoding * QCS_ISO_8859_13
latin-7, Baltic rim character set