| #include <stdio.h> |
| |
/* http://bjoern.hoehrmann.de/utf-8/decoder/dfa */
/* Optimized version based on Rich Felker's variant. */

/*
 * DFA states are stored pre-multiplied by the number of character
 * classes (12), so a state can be added directly to a class to index the
 * transition table.
 *
 * UTF8_ACCEPT: start state, and the state reached after every complete,
 *              valid sequence.
 * UTF8_REJECT: error state; every transition out of it leads back to it.
 */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 12
| |
/*
 * Byte -> character class map (256 entries, classes 0..11).
 *
 * The first part of the decoder table maps bytes to character classes in
 * order to reduce the size of the transition table (one column per class
 * instead of one per byte value).  In the original decoder the classes
 * are also used to create bitmasks; utf8_lookup only needs the class.
 */
static const unsigned char utf8d[] = {
    /* 0x00..0x7F: class 0 */
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    /* 0x80..0x8F: class 1, 0x90..0x9F: class 9 */
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
    /* 0xA0..0xBF: class 7 */
    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
    /* 0xC0..0xC1: class 8, 0xC2..0xDF: class 2 */
    8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
    /* 0xE0: class 10, 0xE1..0xEC and 0xEE..0xEF: class 3, 0xED: class 4,
     * 0xF0: class 11, 0xF1..0xF3: class 6, 0xF4: class 5,
     * 0xF5..0xFF: class 8 */
    10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8
};
/* Note: Splitting the table improves performance on ARM due to its simpler
 * addressing modes not being able to encode x[y + 256]. */
static const unsigned char utf8s[] = {
    /* The second part is a transition table that maps a combination
     * of a state of the automaton and a character class to a state.
     * It is indexed as utf8s[state + class]: there are 12 classes per
     * row and states are stored as row offsets (multiples of 12).
     * State 0 is the accepting/start state and state 12 is the
     * rejecting state, whose row only ever leads back to itself. */
    /* class:
     0  1  2  3  4  5  6  7  8  9 10 11 */
     0,12,24,36,60,96,84,12,12,12,48,72, /* state  0 */
    12,12,12,12,12,12,12,12,12,12,12,12, /* state 12 */
    12, 0,12,12,12,12,12, 0,12, 0,12,12, /* state 24 */
    12,24,12,12,12,12,12,24,12,24,12,12, /* state 36 */
    12,12,12,12,12,12,12,24,12,12,12,12, /* state 48 */
    12,24,12,12,12,12,12,12,12,24,12,12, /* state 60 */
    12,12,12,12,12,12,12,36,12,36,12,12, /* state 72 */
    12,36,12,12,12,12,12,36,12,36,12,12, /* state 84 */
    12,36,12,12,12,12,12,12,12,12,12,12  /* state 96 */
};
| |
| /* Return 0 on success, -1 on error */ |
| int utf8_lookup(const unsigned char *data, int len) |
| { |
| int state = 0; |
| |
| while (len-- && state != UTF8_REJECT) |
| state = utf8s[state + utf8d[*data++]]; |
| |
| return state == UTF8_ACCEPT ? 0 : -1; |
| } |