146 lines
4.2 KiB
C
146 lines
4.2 KiB
C
|
#include <stdbool.h>
|
||
|
#include <stdio.h>
|
||
|
#include <stdlib.h>
|
||
|
#include <string.h>
|
||
|
/*
 * One UTF-8 encoded character stored as raw bytes.
 * The byte buffer carries no NUL terminator; `length` is the only size record.
 */
typedef struct utf8_char {
    char  *bytes;   /* raw bytes of the character */
    size_t length;  /* number of bytes stored in `bytes` */
} utf8_char;
|
||
|
|
||
|
/*
 * Classification of a single byte inside a UTF-8 stream.
 *
 * The numeric values are pinned explicitly because parse_to_utf8() uses the
 * head_* enumerators directly as the byte count of the sequence they start
 * (2, 3 or 4). Do not reorder or renumber them.
 */
typedef enum {
    single_ASCII_byte = 0, /* 0xxxxxxx: complete one-byte character */
    trailing_byte     = 1, /* 10xxxxxx: continuation byte */
    head_double_byte  = 2, /* 110xxxxx: starts a 2-byte sequence */
    head_triple_byte  = 3, /* 1110xxxx: starts a 3-byte sequence */
    head_quad_byte    = 4, /* 11110xxx: starts a 4-byte sequence */
    invalid_utf8_byte = 5, /* 11111xxx: matches no UTF-8 bit pattern */
} utf8_byte_type;
|
||
|
|
||
|
utf8_byte_type get_utf8_byte_type(unsigned char input) {
|
||
|
|
||
|
if ((input & 0b10000000) == 0b10000000) {
|
||
|
|
||
|
if ((input & 0b11000000) == 0b10000000) {
|
||
|
return trailing_byte;
|
||
|
}
|
||
|
if ((input & 0b11100000) == 0b11000000) {
|
||
|
return head_double_byte;
|
||
|
}
|
||
|
if ((input & 0b11110000) == 0b11100000) {
|
||
|
return head_triple_byte;
|
||
|
}
|
||
|
if ((input & 0b11111000) == 0b11110000) {
|
||
|
return head_quad_byte;
|
||
|
}
|
||
|
return invalid_utf8_byte;
|
||
|
}
|
||
|
return single_ASCII_byte;
|
||
|
}
|
||
|
|
||
|
/*
 * Append one character to a utf8_char array.
 *
 * The slot at index *utf8_str_length receives `byte_buffer` (the pointer
 * itself, not a copy) and `buf_length`; the count is then incremented.
 * The array is never grown here, so the caller must guarantee that the slot
 * already exists.
 */
void push_byte_buffer_to_utf8_str(utf8_char **utf8_str, size_t *utf8_str_length, char *byte_buffer, int buf_length) {
    utf8_char *slot = *utf8_str + *utf8_str_length;

    slot->bytes = byte_buffer;
    slot->length = buf_length;

    *utf8_str_length += 1;
}
|
||
|
|
||
|
/*
 * Release a utf8_char array together with every per-character byte buffer.
 *
 * str:    array to release; NULL is accepted and ignored, like free(NULL).
 * length: number of initialised elements whose `bytes` must be freed.
 */
void free_utf8_str(utf8_char *str, int length) {
    if (str == NULL) {
        /* Nothing to release; avoids indexing a NULL array when length > 0. */
        return;
    }
    for (int i = 0; i < length; i++) {
        free(str[i].bytes);
    }
    free(str);
}
|
||
|
|
||
|
/*
 * Split a NUL-terminated byte string into an array of UTF-8 characters.
 *
 * Every element of the result owns a heap buffer holding the raw bytes of one
 * character (no NUL terminator) plus its byte count.
 *
 * input:          NUL-terminated byte string to split.
 * output_string:  on success receives the newly allocated array; release it
 *                 with free_utf8_str(). Left untouched on failure.
 * output_length:  on success receives the number of characters. Left
 *                 untouched on failure.
 *
 * Returns 0 on success. Returns 1 when an argument is NULL, when an
 * allocation fails, or when the input is structurally malformed: a stray
 * continuation byte, a byte matching no UTF-8 pattern, a head byte followed
 * by a non-continuation byte, or a sequence cut short by the end of the
 * string. Overlong encodings and surrogate code points are not rejected.
 */
int parse_to_utf8(char *input, utf8_char **output_string, size_t *output_length) {
    if (input == NULL || output_string == NULL || output_length == NULL) {
        return 1;
    }

    const size_t input_str_length = strlen(input);

    /*
     * One slot per input byte is the worst case (pure ASCII). At least one
     * element is requested so an empty input still yields a non-NULL array
     * and a NULL result always means allocation failure.
     */
    utf8_char *output_str = calloc(input_str_length > 0 ? input_str_length : 1, sizeof *output_str);
    if (output_str == NULL) {
        return 1;
    }
    size_t utf8_str_len = 0;

    size_t i = 0;
    while (i < input_str_length) {
        /* Map the head byte to the sequence length explicitly instead of
         * relying on the numeric value of the enumerator. */
        size_t char_length;
        switch (get_utf8_byte_type((unsigned char)input[i])) {
        case single_ASCII_byte:
            char_length = 1;
            break;
        case head_double_byte:
            char_length = 2;
            break;
        case head_triple_byte:
            char_length = 3;
            break;
        case head_quad_byte:
            char_length = 4;
            break;
        case trailing_byte:     /* continuation byte without a head byte */
        case invalid_utf8_byte:
        default:
            char_length = 0;
            break;
        }

        /* Reject invalid lead bytes and sequences truncated by end of input. */
        if (char_length == 0 || char_length > input_str_length - i) {
            goto fail;
        }

        /* Validate before allocating so nothing can leak on a bad sequence. */
        for (size_t k = 1; k < char_length; k++) {
            if (get_utf8_byte_type((unsigned char)input[i + k]) != trailing_byte) {
                goto fail;
            }
        }

        char *byte_buffer = malloc(char_length);
        if (byte_buffer == NULL) {
            goto fail;
        }
        memcpy(byte_buffer, input + i, char_length);
        push_byte_buffer_to_utf8_str(&output_str, &utf8_str_len, byte_buffer, (int)char_length);

        i += char_length;
    }

    if (utf8_str_len > 0 && utf8_str_len < input_str_length) {
        /* Shrink to the used size; the size is in bytes, not elements.
         * A failed shrink is harmless: the larger array stays valid. */
        utf8_char *shrunk = realloc(output_str, utf8_str_len * sizeof *output_str);
        if (shrunk != NULL) {
            output_str = shrunk;
        }
    }

    *output_length = utf8_str_len;
    *output_string = output_str;
    return 0;

fail:
    /* free_utf8_str() takes an int count; the cast makes that explicit. */
    free_utf8_str(output_str, (int)utf8_str_len);
    return 1;
}
|
||
|
|
||
|
/*
 * Report whether two characters hold the same byte sequence.
 *
 * Returns true when both have the same length and identical bytes,
 * false otherwise. Two zero-length characters compare equal.
 */
bool utf8_char_eq(utf8_char *a, utf8_char *b) {
    const size_t byte_count = a->length;

    if (byte_count != b->length) {
        return false;
    }
    /* Zero-length characters are equal; the buffers are not touched. */
    if (byte_count == 0) {
        return true;
    }
    return memcmp(a->bytes, b->bytes, byte_count) == 0;
}
|
||
|
|
||
|
/*
 * Write every character of a utf8_char array to stdout, byte by byte,
 * in order. No separator or trailing newline is emitted.
 *
 * str:    array of characters to print.
 * length: number of elements in `str`.
 */
void print_utf8_str(utf8_char *str, size_t length) {
    for (size_t i = 0; i < length; i++) {
        for (size_t j = 0; j < str[i].length; j++) {
            putc(str[i].bytes[j], stdout);
        }
    }
}
|
||
|
|
||
|
/*
 * Create an independent deep copy of one character.
 *
 * from: character to duplicate.
 *
 * Returns a utf8_char whose `bytes` is a fresh heap buffer the caller must
 * free(). If `from` is empty or the allocation fails, the result has
 * bytes == NULL and length == 0, so it is always safe to free and to print.
 */
utf8_char copy_utf8_char(utf8_char *from) {
    utf8_char to_char = { .bytes = NULL, .length = 0 };

    if (from->length == 0) {
        return to_char;
    }

    to_char.bytes = malloc(from->length);
    if (to_char.bytes == NULL) {
        /* Report failure as an empty character rather than writing to NULL. */
        return to_char;
    }

    memcpy(to_char.bytes, from->bytes, from->length);
    to_char.length = from->length;
    return to_char;
}
|