]> code.delx.au - pulseaudio/blob - src/polyp/utf8.c
33fa7214e0aedb409a61b9c6a3fac0ff75466062
[pulseaudio] / src / polyp / utf8.c
1 /* $Id$ */
2
3 /* This file is based on the GLIB utf8 validation functions. The
4 * original license text follows. */
5
6 /* gutf8.c - Operations on UTF-8 strings.
7 *
8 * Copyright (C) 1999 Tom Tromey
9 * Copyright (C) 2000 Red Hat, Inc.
10 *
11 * This library is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This library is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with this library; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 02111-1307, USA.
25 */
26
27 #ifdef HAVE_CONFIG_H
28 #include <config.h>
29 #endif
30
31 #include <assert.h>
32 #include <errno.h>
33 #include <stdlib.h>
34 #include <inttypes.h>
35 #include <string.h>
36
37 #ifdef HAVE_ICONV
38 #include <iconv.h>
39 #endif
40
41 #include "utf8.h"
42 #include "xmalloc.h"
43
44 #define FILTER_CHAR '_'
45
/* Decide whether a decoded code point is acceptable.
 *
 * Returns 1 when ch is acceptable and 0 when it falls into one of the
 * rejected ranges listed below. */
static inline int is_unicode_valid(uint32_t ch) {
    /* Anything at or above 0x110000 lies past the end of the Unicode space. */
    const int beyond_unicode = ch >= 0x110000;
    /* 0xD800..0xDFFF is the area reserved for UTF-16. */
    const int utf16_reserved = (ch & 0xFFFFF800) == 0xD800;
    /* 0xFDD0..0xFDEF is a reserved range. */
    const int reserved_range = ch >= 0xFDD0 && ch <= 0xFDEF;
    /* Low 16 bits equal to 0xFFFE or 0xFFFF (the byte-swapped BOM value
     * and its neighbour, in every plane). */
    const int fffe_or_ffff = (ch & 0xFFFE) == 0xFFFE;

    return !(beyond_unicode || utf16_reserved || reserved_range || fffe_or_ffff);
}
57
/* Return 1 if ch has the bit pattern 10xxxxxx of a UTF-8 continuation
 * byte, 0 otherwise. */
static inline int is_continuation_char(uint8_t ch) {
    return (ch & 0xc0) == 0x80;
}
63
/* Append the six payload bits of continuation byte ch to the code
 * point accumulated in *u_ch: the accumulator is shifted left by six
 * bits and the low six bits of ch are OR-ed in. */
static inline void merge_continuation_char(uint32_t *u_ch, uint8_t ch) {
    *u_ch = (*u_ch << 6) | (uint32_t) (ch & 0x3f);
}
68
69 static char* utf8_validate(const char *str, char *output) {
70 uint32_t val = 0;
71 uint32_t min = 0;
72 const uint8_t *p, *last;
73 int size;
74 uint8_t *o;
75
76 o = (uint8_t*) output;
77 for (p = (const uint8_t*) str; *p; p++) {
78 if (*p < 128) {
79 if (o)
80 *o = *p;
81 } else {
82 last = p;
83
84 if ((*p & 0xe0) == 0xc0) { /* 110xxxxx two-char seq. */
85 size = 2;
86 min = 128;
87 val = *p & 0x1e;
88 goto ONE_REMAINING;
89 } else if ((*p & 0xf0) == 0xe0) { /* 1110xxxx three-char seq.*/
90 size = 3;
91 min = (1 << 11);
92 val = *p & 0x0f;
93 goto TWO_REMAINING;
94 } else if ((*p & 0xf8) == 0xf0) { /* 11110xxx four-char seq */
95 size = 4;
96 min = (1 << 16);
97 val = *p & 0x07;
98 } else {
99 size = 1;
100 goto error;
101 }
102
103 p++;
104 if (!is_continuation_char(*p))
105 goto error;
106 merge_continuation_char(&val, *p);
107
108 TWO_REMAINING:
109 p++;
110 if (!is_continuation_char(*p))
111 goto error;
112 merge_continuation_char(&val, *p);
113
114 ONE_REMAINING:
115 p++;
116 if (!is_continuation_char(*p))
117 goto error;
118 merge_continuation_char(&val, *p);
119
120 if (val < min)
121 goto error;
122
123 if (!is_unicode_valid(val))
124 goto error;
125
126 if (o) {
127 memcpy(o, last, size);
128 o += size - 1;
129 }
130
131 if (o)
132 o++;
133
134 continue;
135
136 error:
137 if (o) {
138 *o = FILTER_CHAR;
139 p = last; /* We retry at the next character */
140 } else
141 goto failure;
142 }
143
144 if (o)
145 o++;
146 }
147
148 if (o) {
149 *o = '\0';
150 return output;
151 }
152
153 return (char*) str;
154
155 failure:
156 return NULL;
157 }
158
/* Check whether str is valid UTF-8 by running utf8_validate() without
 * an output buffer. Returns whatever utf8_validate() returns in that
 * mode: str itself on success, NULL if an invalid sequence is found. */
const char* pa_utf8_valid (const char *str) {
    char *const no_output = NULL;   /* validate only, nothing is written */

    return utf8_validate(str, no_output);
}
162
163 char* pa_utf8_filter (const char *str) {
164 char *new_str;
165
166 new_str = pa_xnew(char, strlen(str) + 1);
167
168 return utf8_validate(str, new_str);
169 }
170
171 #ifdef HAVE_ICONV
172
173 static char* iconv_simple(const char *str, const char *to, const char *from) {
174 char *new_str;
175 size_t len, inlen;
176
177 iconv_t cd;
178 ICONV_CONST char *inbuf;
179 char *outbuf;
180 size_t res, inbytes, outbytes;
181
182 cd = iconv_open(to, from);
183 if (cd == (iconv_t)-1)
184 return NULL;
185
186 inlen = len = strlen(str) + 1;
187 new_str = pa_xmalloc(len);
188 assert(new_str);
189
190 while (1) {
191 inbuf = (ICONV_CONST char*)str; /* Brain dead prototype for iconv() */
192 inbytes = inlen;
193 outbuf = new_str;
194 outbytes = len;
195
196 res = iconv(cd, &inbuf, &inbytes, &outbuf, &outbytes);
197
198 if (res != (size_t)-1)
199 break;
200
201 if (errno != E2BIG) {
202 pa_xfree(new_str);
203 new_str = NULL;
204 break;
205 }
206
207 assert(inbytes != 0);
208
209 len += inbytes;
210 new_str = pa_xrealloc(new_str, len);
211 assert(new_str);
212 }
213
214 iconv_close(cd);
215
216 return new_str;
217 }
218
/* Convert str from "UTF-8" to the character set named by the empty
 * string (presumably the current locale's character set, as the
 * function name suggests -- confirm for the iconv implementation in
 * use). Returns the result of iconv_simple(): a newly allocated string,
 * or NULL on failure. */
char* pa_utf8_to_locale (const char *str) {
    const char *const source_charset = "UTF-8";
    const char *const target_charset = "";

    return iconv_simple(str, target_charset, source_charset);
}
222
/* Convert str from the character set named by the empty string
 * (presumably the current locale's character set, as the function name
 * suggests -- confirm for the iconv implementation in use) to "UTF-8".
 * Returns the result of iconv_simple(): a newly allocated string, or
 * NULL on failure. */
char* pa_locale_to_utf8 (const char *str) {
    const char *const source_charset = "";
    const char *const target_charset = "UTF-8";

    return iconv_simple(str, target_charset, source_charset);
}
226
227 #else
228
/* Fallback used when HAVE_ICONV is not defined: no conversion is
 * available, so NULL is returned for every input. */
char* pa_utf8_to_locale (const char *str) {
    (void) str;   /* unused in this build */

    return NULL;
}
232
/* Fallback used when HAVE_ICONV is not defined: no conversion is
 * available, so NULL is returned for every input. */
char* pa_locale_to_utf8 (const char *str) {
    (void) str;   /* unused in this build */

    return NULL;
}
236
237 #endif