-
Notifications
You must be signed in to change notification settings - Fork 202
/
unicode.c
79 lines (70 loc) · 1.24 KB
/
unicode.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include "unicode.h"
int utf8_last_size(const char *str) {
int len = 0;
char *pos = strchr(str, '\0');
while (pos > str) {
--pos; ++len;
if ((*pos & 0xc0) != 0x80) {
return len;
}
}
return 0;
}
size_t utf8_chsize(uint32_t ch) {
if (ch < 0x80) {
return 1;
} else if (ch < 0x800) {
return 2;
} else if (ch < 0x10000) {
return 3;
}
return 4;
}
size_t utf8_encode(char *str, uint32_t ch) {
size_t len = 0;
uint8_t first;
if (ch < 0x80) {
first = 0;
len = 1;
} else if (ch < 0x800) {
first = 0xc0;
len = 2;
} else if (ch < 0x10000) {
first = 0xe0;
len = 3;
} else {
first = 0xf0;
len = 4;
}
for (size_t i = len - 1; i > 0; --i) {
str[i] = (ch & 0x3f) | 0x80;
ch >>= 6;
}
str[0] = ch | first;
return len;
}
static const struct {
uint8_t mask;
uint8_t result;
int octets;
} sizes[] = {
{ 0x80, 0x00, 1 },
{ 0xE0, 0xC0, 2 },
{ 0xF0, 0xE0, 3 },
{ 0xF8, 0xF0, 4 },
{ 0xFC, 0xF8, 5 },
{ 0xFE, 0xF8, 6 },
{ 0x80, 0x80, -1 },
};
int utf8_size(const char *s) {
uint8_t c = (uint8_t)*s;
for (size_t i = 0; i < sizeof(sizes) / sizeof(*sizes); ++i) {
if ((c & sizes[i].mask) == sizes[i].result) {
return sizes[i].octets;
}
}
return -1;
}