]> code.delx.au - pulseaudio/blob - src/polypcore/utf8.c
Move xmalloc to the public side (libpolyp).
[pulseaudio] / src / polypcore / utf8.c
1 /* $Id */
2
3 /* This file is based on the GLIB utf8 validation functions. The
4 * original license text follows. */
5
6 /* gutf8.c - Operations on UTF-8 strings.
7 *
8 * Copyright (C) 1999 Tom Tromey
9 * Copyright (C) 2000 Red Hat, Inc.
10 *
11 * This library is free software; you can redistribute it and/or
12 * modify it under the terms of the GNU Lesser General Public
13 * License as published by the Free Software Foundation; either
14 * version 2 of the License, or (at your option) any later version.
15 *
16 * This library is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * Lesser General Public License for more details.
20 *
21 * You should have received a copy of the GNU Lesser General Public
22 * License along with this library; if not, write to the
23 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 * Boston, MA 02111-1307, USA.
25 */
26
27 #ifdef HAVE_CONFIG_H
28 #include <config.h>
29 #endif
30
31 #include <assert.h>
32 #include <stdlib.h>
33 #include <inttypes.h>
34 #include <string.h>
35
36 #include <polyp/xmalloc.h>
37
38 #include "utf8.h"
39
40 #define FILTER_CHAR '_'
41
/* Decide whether ch is an acceptable code point.
 *
 * Returns 1 if ch is accepted, 0 if it falls into one of the rejected
 * ranges listed below. */
static inline int is_unicode_valid(uint32_t ch) {
    /* Anything at or above 0x110000 lies past the end of the Unicode space */
    const int past_end = ch >= 0x110000;
    /* 0xD800..0xDFFF: area reserved for UTF-16 */
    const int utf16_reserved = (ch & 0xFFFFF800) == 0xD800;
    /* 0xFDD0..0xFDEF: reserved range */
    const int reserved_range = ch >= 0xFDD0 && ch <= 0xFDEF;
    /* Low 16 bits equal to 0xFFFE or 0xFFFF. Only the low 16 bits are
     * inspected, so this matches 0x1FFFE, 0x1FFFF, ... as well. Note that
     * the byte order mark itself (0xFEFF) is NOT matched by this test. */
    const int fffe_or_ffff = (ch & 0xFFFE) == 0xFFFE;

    return !(past_end || utf16_reserved || reserved_range || fffe_or_ffff);
}
53
/* Returns 1 if ch has the bit pattern 10xxxxxx of a UTF-8 continuation
 * byte, 0 otherwise. A NUL byte is therefore never a continuation byte. */
static inline int is_continuation_char(uint8_t ch) {
    return (ch & 0xc0) == 0x80;
}
59
/* Append the six payload bits of ch to the value accumulated in *u_ch:
 * the accumulator is shifted left by six and the low six bits of ch are
 * OR-ed in. ch is not checked for being a continuation byte here. */
static inline void merge_continuation_char(uint32_t *u_ch, uint8_t ch) {
    *u_ch = (*u_ch << 6) | (uint32_t) (ch & 0x3f);
}
64
65 static char* utf8_validate(const char *str, char *output) {
66 uint32_t val = 0;
67 uint32_t min = 0;
68 const uint8_t *p, *last;
69 int size;
70 uint8_t *o;
71
72 o = (uint8_t*) output;
73 for (p = (const uint8_t*) str; *p; p++) {
74 if (*p < 128) {
75 if (o)
76 *o = *p;
77 } else {
78 last = p;
79
80 if ((*p & 0xe0) == 0xc0) { /* 110xxxxx two-char seq. */
81 size = 2;
82 min = 128;
83 val = *p & 0x1e;
84 goto ONE_REMAINING;
85 } else if ((*p & 0xf0) == 0xe0) { /* 1110xxxx three-char seq.*/
86 size = 3;
87 min = (1 << 11);
88 val = *p & 0x0f;
89 goto TWO_REMAINING;
90 } else if ((*p & 0xf8) == 0xf0) { /* 11110xxx four-char seq */
91 size = 4;
92 min = (1 << 16);
93 val = *p & 0x07;
94 } else {
95 size = 1;
96 goto error;
97 }
98
99 p++;
100 if (!is_continuation_char(*p))
101 goto error;
102 merge_continuation_char(&val, *p);
103
104 TWO_REMAINING:
105 p++;
106 if (!is_continuation_char(*p))
107 goto error;
108 merge_continuation_char(&val, *p);
109
110 ONE_REMAINING:
111 p++;
112 if (!is_continuation_char(*p))
113 goto error;
114 merge_continuation_char(&val, *p);
115
116 if (val < min)
117 goto error;
118
119 if (!is_unicode_valid(val))
120 goto error;
121
122 if (o) {
123 memcpy(o, last, size);
124 o += size - 1;
125 }
126
127 if (o)
128 o++;
129
130 continue;
131
132 error:
133 if (o) {
134 *o = FILTER_CHAR;
135 p = last; /* We retry at the next character */
136 } else
137 goto failure;
138 }
139
140 if (o)
141 o++;
142 }
143
144 if (o) {
145 *o = '\0';
146 return output;
147 }
148
149 return (char*) str;
150
151 failure:
152 return NULL;
153 }
154
/* Check whether str is valid UTF-8 as judged by utf8_validate().
 * Returns str itself if it is, NULL otherwise. Nothing is allocated. */
const char* pa_utf8_valid (const char *str) {
    /* A NULL output buffer selects the validate-only mode */
    char *no_output = NULL;

    return utf8_validate(str, no_output);
}
158
159 char* pa_utf8_filter (const char *str) {
160 char *new_str;
161
162 new_str = pa_xnew(char, strlen(str) + 1);
163
164 return utf8_validate(str, new_str);
165 }