#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/*
 * One UTF-8 encoded character.
 * `bytes` is a heap buffer holding exactly `length` bytes; it is not
 * NUL-terminated. Buffers produced by this file are released with free().
 */
typedef struct utf8_char {
    char *bytes;
    size_t length;
} utf8_char;

/* Classification of a single byte according to its high-order bits. */
typedef enum {
    single_ASCII_byte,
    trailing_byte,
    head_double_byte,
    head_triple_byte,
    head_quad_byte,
    invalid_utf8_byte,
} utf8_byte_type;

/*
 * Classify one byte by its leading bit pattern:
 *   0xxxxxxx -> single_ASCII_byte
 *   10xxxxxx -> trailing_byte
 *   110xxxxx -> head_double_byte
 *   1110xxxx -> head_triple_byte
 *   11110xxx -> head_quad_byte
 *   anything else -> invalid_utf8_byte
 *
 * Only the prefix bits are inspected; the payload bits are not validated.
 */
utf8_byte_type get_utf8_byte_type(unsigned char input)
{
    if ((input & 0x80) == 0x00) {
        return single_ASCII_byte;
    }
    if ((input & 0xC0) == 0x80) {
        return trailing_byte;
    }
    if ((input & 0xE0) == 0xC0) {
        return head_double_byte;
    }
    if ((input & 0xF0) == 0xE0) {
        return head_triple_byte;
    }
    if ((input & 0xF8) == 0xF0) {
        return head_quad_byte;
    }
    return invalid_utf8_byte;
}

/*
 * Number of bytes in the sequence introduced by a byte of the given type,
 * or 0 when that byte cannot start a sequence. A trailing byte without a
 * head byte is invalid, so it maps to 0 just like invalid_utf8_byte.
 */
static size_t utf8_sequence_length(utf8_byte_type type)
{
    switch (type) {
    case single_ASCII_byte:
        return 1;
    case head_double_byte:
        return 2;
    case head_triple_byte:
        return 3;
    case head_quad_byte:
        return 4;
    case trailing_byte:
    case invalid_utf8_byte:
    default:
        return 0;
    }
}

/*
 * Append one character to the array.
 *
 * Stores `byte_buffer` (the pointer itself, not a copy) and `buf_length` in
 * element (*utf8_str)[*utf8_str_length], then increments *utf8_str_length.
 * No bounds check or reallocation is done here: the caller must guarantee
 * that the array already has room for one more element.
 */
void push_byte_buffer_to_utf8_str(utf8_char **utf8_str, size_t *utf8_str_length,
                                  char *byte_buffer, int buf_length)
{
    utf8_char *slot = &(*utf8_str)[*utf8_str_length];

    slot->bytes = byte_buffer;
    slot->length = (size_t)buf_length;
    *utf8_str_length += 1;
}

/* Free the byte buffer of the first `count` elements, then the array itself. */
static void release_utf8_chars(utf8_char *str, size_t count)
{
    for (size_t i = 0; i < count; i++) {
        free(str[i].bytes);
    }
    free(str);
}

/*
 * Free an array of `length` characters together with every per-character
 * byte buffer. A NULL `str` is ignored; a non-positive `length` frees only
 * the array.
 */
void free_utf8_str(utf8_char *str, int length)
{
    if (str == NULL) {
        return;
    }
    release_utf8_chars(str, length > 0 ? (size_t)length : 0);
}

/*
 * Validate and copy the `seq_len`-byte sequence starting at `src`, of which
 * `available` bytes are readable.
 *
 * Returns a freshly allocated buffer of `seq_len` bytes, or NULL when the
 * sequence is cut short, when one of its continuation bytes is not a
 * trailing byte, or when allocation fails. Validation happens before the
 * allocation so that no buffer has to be released on the error paths.
 */
static char *copy_utf8_sequence(const char *src, size_t available, size_t seq_len)
{
    if (seq_len == 0 || seq_len > available) {
        return NULL;
    }
    for (size_t k = 1; k < seq_len; k++) {
        if (get_utf8_byte_type((unsigned char)src[k]) != trailing_byte) {
            return NULL;
        }
    }

    char *buffer = malloc(seq_len);
    if (buffer != NULL) {
        memcpy(buffer, src, seq_len);
    }
    return buffer;
}

/*
 * Split the NUL-terminated string `input` into UTF-8 characters.
 *
 * On success returns 0, stores a heap-allocated array in *output_string and
 * its element count in *output_length; the caller releases it with
 * free_utf8_str(). On failure returns 1 and leaves both output parameters
 * untouched. Failure means: a byte that cannot start a sequence (a stray
 * trailing byte or an invalid byte), a continuation byte that is not a
 * trailing byte, a sequence truncated by the end of the string, or an
 * allocation failure.
 */
int parse_to_utf8(char *input, utf8_char **output_string, size_t *output_length)
{
    const size_t input_len = strlen(input);

    /*
     * Every character occupies at least one input byte, so input_len
     * elements is an upper bound. At least one element is requested so that
     * empty input never depends on what a zero-sized allocation returns.
     */
    utf8_char *chars = calloc(input_len > 0 ? input_len : 1, sizeof *chars);
    if (chars == NULL) {
        return 1;
    }

    size_t count = 0;
    size_t pos = 0;

    /* The step size depends on the byte just read, hence while, not for. */
    while (pos < input_len) {
        const utf8_byte_type type = get_utf8_byte_type((unsigned char)input[pos]);
        const size_t seq_len = utf8_sequence_length(type);
        char *buffer = copy_utf8_sequence(input + pos, input_len - pos, seq_len);

        if (buffer == NULL) {
            release_utf8_chars(chars, count);
            return 1;
        }
        push_byte_buffer_to_utf8_str(&chars, &count, buffer, (int)seq_len);
        pos += seq_len;
    }

    if (count > 0 && count < input_len) {
        /*
         * Shrink to the number of elements actually used. The size is in
         * bytes, so it must be scaled by the element size. If shrinking
         * fails, the original (larger) block is still valid and is kept.
         */
        utf8_char *shrunk = realloc(chars, count * sizeof *chars);
        if (shrunk != NULL) {
            chars = shrunk;
        }
    }

    *output_length = count;
    *output_string = chars;
    return 0;
}

/* Return true when both characters have the same length and the same bytes. */
bool utf8_char_eq(utf8_char *a, utf8_char *b)
{
    if (a->length != b->length) {
        return false;
    }
    /* Skip memcmp for empty characters so it is never handed a null buffer. */
    return a->length == 0 || memcmp(a->bytes, b->bytes, a->length) == 0;
}

/* Write the raw bytes of every character in `str` to stdout, in order. */
void print_utf8_str(utf8_char *str, size_t length)
{
    for (size_t i = 0; i < length; i++) {
        for (size_t j = 0; j < str[i].length; j++) {
            putc(str[i].bytes[j], stdout);
        }
    }
}

/*
 * Return a deep copy of `from`: same length, bytes duplicated into a new
 * heap buffer that the caller must free(). If the buffer cannot be
 * allocated, the result has bytes == NULL and length == 0.
 */
utf8_char copy_utf8_char(utf8_char *from)
{
    utf8_char to_char = { .bytes = NULL, .length = 0 };

    if (from->length == 0) {
        return to_char;
    }

    to_char.bytes = malloc(from->length);
    if (to_char.bytes == NULL) {
        return to_char;
    }
    memcpy(to_char.bytes, from->bytes, from->length);
    to_char.length = from->length;
    return to_char;
}