/*
 * Copyright 2008-2011 Various Authors
 * Copyright 2004-2005 Timo Hirvonen
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation; either version 2 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

/*
 * Include guard. The name deliberately avoids the "underscore followed by
 * an uppercase letter" form, which C reserves for the implementation.
 */
#ifndef CMUS_UCHAR_H
#define CMUS_UCHAR_H

#include <stddef.h> /* size_t */

/*
 * Holds one potential unicode character, or an invalid byte tagged with
 * U_INVALID_MASK.
 */
typedef unsigned int uchar;

/*
 * 16-entry table defined elsewhere.
 * NOTE(review): presumably the hexadecimal digit characters -- confirm
 * against the definition.
 */
extern const char hex_tab[16];

/*
 * Invalid bytes are or'ed with this,
 * for example 0xff -> 0x100000ff
 */
#define U_INVALID_MASK 0x10000000U

/*
 * @uch  potential unicode character
 *
 * Returns 1 if @uch is valid unicode character, 0 otherwise.
 *
 * This is a pure range check against 0x10ffff; anything carrying
 * U_INVALID_MASK is above that limit and therefore rejected.
 */
static inline int u_is_unicode(uchar uch)
{
	return uch <= 0x0010ffffU;
}

/*
 * @uch  potential unicode character
 *
 * Returns size of @uch in bytes (1-4; the thresholds are the upper bounds
 * of the 1, 2, 3 and 4 byte UTF-8 ranges).
 *
 * Values above 0x10ffff, such as invalid bytes or'ed with U_INVALID_MASK,
 * are reported as 1 byte.
 */
static inline int u_char_size(uchar uch)
{
	if (uch <= 0x0000007fU)
		return 1;
	if (uch <= 0x000007ffU)
		return 2;
	if (uch <= 0x0000ffffU)
		return 3;
	if (uch <= 0x0010ffffU)
		return 4;

	/* out of unicode range: counts as a single byte */
	return 1;
}

/*
63
 * Returns width of @uch (normally 1 or 2, 4 for invalid chars (<xx>))
64
 */
65
int u_char_width(uchar uch);
66
67
/*
 * @str  any null-terminated string
 *
 * Returns 1 if @str is valid UTF-8 string, 0 otherwise.
 */
int u_is_valid(const char *str);

/*
 * @str  valid, null-terminated UTF-8 string
 *
 * Returns position of next unicode character in @str.
 *
 * The byte at @str is read as an unsigned value and used as an index into
 * utf8_skip (defined elsewhere); the entry found there is the number of
 * bytes to advance. The const qualifier of @str is cast away in the
 * returned pointer.
 */
extern const char * const utf8_skip;
static inline char *u_next_char(const char *str)
{
	const unsigned char lead = *(const unsigned char *) str;

	return (char *) (str + utf8_skip[lead]);
}

/*
 * @str  valid, null-terminated UTF-8 string
 *
 * Returns length of @str in UTF-8 characters.
 */
size_t u_strlen(const char *str);

/*
 * @str  null-terminated UTF-8 string
 *
 * Returns length of @str in UTF-8 characters.
 * Invalid chars are counted as single characters.
 */
size_t u_strlen_safe(const char *str);

/*
 * @str  null-terminated UTF-8 string
 *
 * Returns width of @str.
 */
int u_str_width(const char *str);

/*
 * @str  null-terminated UTF-8 string
 * @len  number of characters to measure
 *
 * Returns width of the first @len characters in @str.
 */
int u_str_nwidth(const char *str, int len);

/*
116
 * @str  null-terminated UTF-8 string
117
 * @uch  unicode character
118
 *
119
 * Returns a pointer to the first occurrence of @uch in the @str.
120
 */
121
char *u_strchr(const char *str, uchar uch);
122
123
/*
 * @str  string
 * @idx  pointer to an index into @str
 *
 * NOTE(review): this function is not documented in this header. Judging by
 * its name it presumably moves *@idx back to the start of the previous
 * character in @str -- confirm against the definition before relying on it.
 */
void u_prev_char_pos(const char *str, int *idx);

/*
126
 * @str  null-terminated UTF-8 string
127
 * @idx  pointer to byte index in @str (not UTF-8 character index!) or NULL
128
 *
129
 * Returns unicode character at @str[*@idx] or @str[0] if @idx is NULL.
130
 * Stores byte index of the next char back to @idx if set.
131
 */
132
uchar u_get_char(const char *str, int *idx);
133
134
/*
135
 * @str  destination buffer
136
 * @idx  pointer to byte index in @str (not UTF-8 character index!)
137
 * @uch  unicode character
138
 */
139
void u_set_char_raw(char *str, int *idx, uchar uch);
140
void u_set_char(char *str, int *idx, uchar uch);
141
142
/*
 * @dst    destination buffer
 * @src    null-terminated UTF-8 string
 * @width  in: how much to copy; out: actual width of the copied characters
 *
 * Copies at most @width worth of characters, less if null byte was hit.
 * Null byte is _never_ copied.
 * Actual width of copied characters is stored to @width.
 *
 * Returns number of _bytes_ copied.
 */
int u_copy_chars(char *dst, const char *src, int *width);

/*
 * @dst  destination buffer
 * @src  null-terminated UTF-8 string
 * @len  how many bytes are available in @dst
 *
 * Copies at most @len bytes, less if null byte was hit. Replaces every
 * non-ascii character by '?'. Null byte is _never_ copied.
 *
 * Returns number of bytes written to @dst.
 */
int u_to_ascii(char *dst, const char *src, int len);

/*
 * @str    null-terminated UTF-8 string, must be long enough
 * @width  in: how much to skip; out: total width of the skipped characters
 *
 * Skips UTF-8 characters of @str until @width is reached.
 * Total width of skipped characters is stored to @width.
 * Returned @width can be the given @width + 1 if the last skipped
 * character was double width.
 *
 * Returns number of _bytes_ skipped.
 */
int u_skip_chars(const char *str, int *width);

/*
 * @str  valid null-terminated UTF-8 string
 *
 * Converts a string into a form that is independent of case.
 *
 * Returns a newly allocated string.
 * NOTE(review): presumably the caller owns and must free the result --
 * confirm against the definition.
 */
char *u_casefold(const char *str);

/*
 * @str1  valid, normalized, null-terminated UTF-8 string
 * @str2  valid, normalized, null-terminated UTF-8 string
 *
 * Returns 1 if @str1 is equal to @str2, ignoring the case of the characters.
 */
int u_strcase_equal(const char *str1, const char *str2);

/*
 * @str1  valid, normalized, null-terminated UTF-8 string
 * @str2  valid, normalized, null-terminated UTF-8 string
 * @len   number of characters to consider for comparison
 *
 * Returns 1 if the first @len characters of @str1 and @str2 are equal,
 * ignoring the case of the characters (0 otherwise).
 */
int u_strncase_equal(const char *str1, const char *str2, size_t len);

/*
 * @str1  valid, normalized, null-terminated UTF-8 string
 * @str2  valid, normalized, null-terminated UTF-8 string
 * @len   number of characters to consider for comparison
 *
 * Like u_strncase_equal(), but uses only base characters for comparison
 * (e.g. "Trentemöller" matches "Trentemøller")
 */
int u_strncase_equal_base(const char *str1, const char *str2, size_t len);

/*
 * @haystack  valid, normalized, null-terminated UTF-8 string
 * @needle    valid, normalized, null-terminated UTF-8 string
 *
 * Returns position of @needle in @haystack (case insensitive comparison).
 */
char *u_strcasestr(const char *haystack, const char *needle);

/*
 * @haystack  valid, normalized, null-terminated UTF-8 string
 * @needle    valid, normalized, null-terminated UTF-8 string
 *
 * Like u_strcasestr(), but uses only base characters for comparison
 * (e.g. "Trentemöller" matches "Trentemøller")
 */
char *u_strcasestr_base(const char *haystack, const char *needle);

/*
 * @haystack  null-terminated string in local encoding
 * @needle    valid, normalized, null-terminated UTF-8 string
 *
 * Like u_strcasestr_base(), but converts @haystack to UTF-8 if necessary.
 */
char *u_strcasestr_filename(const char *haystack, const char *needle);

#endif