ACaesar/utf8hack.c

146 lines
4.2 KiB
C
Raw Normal View History

2023-09-04 00:48:08 +00:00
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// One UTF-8 encoded character, stored as a run of raw bytes.
struct utf8_char {
    // The encoded bytes of the character. The buffer is not NUL-terminated;
    // its extent is given by `length`.
    char *bytes;
    // Number of bytes stored in `bytes`.
    size_t length;
};
typedef struct utf8_char utf8_char;
// Role a single byte plays inside a UTF-8 byte stream.
//
// The head_* constants are numbered so that each value equals the total byte
// count of the sequence it introduces (2, 3, 4); keep that correspondence when
// editing this list.
typedef enum {
    single_ASCII_byte = 0, // stand-alone byte (high bit clear)
    trailing_byte = 1,     // continuation byte of a multi-byte sequence
    head_double_byte = 2,  // first byte of a two-byte sequence
    head_triple_byte = 3,  // first byte of a three-byte sequence
    head_quad_byte = 4,    // first byte of a four-byte sequence
    invalid_utf8_byte = 5, // byte that fits none of the patterns above
} utf8_byte_type;
// Classifies one byte of a UTF-8 stream by the role its bit pattern implies.
//
// input: the byte to classify.
// Returns:
//   single_ASCII_byte  for 0x00-0x7F (0xxxxxxx)
//   trailing_byte      for 0x80-0xBF (10xxxxxx)
//   head_double_byte   for 0xC2-0xDF (110xxxxx)
//   head_triple_byte   for 0xE0-0xEF (1110xxxx)
//   head_quad_byte     for 0xF0-0xF4 (11110xxx)
//   invalid_utf8_byte  for everything else.
//
// 0xC0 and 0xC1 could only start an overlong encoding of an ASCII character,
// and 0xF5 and above would encode values past U+10FFFF, so none of them can
// appear in well-formed UTF-8 and they are reported as invalid.
//
// Only this single byte is inspected; whether the bytes that follow a head
// byte are valid is up to the caller.
utf8_byte_type get_utf8_byte_type(unsigned char input) {
    if (input < 0x80) {
        return single_ASCII_byte;
    }
    if (input < 0xC0) {
        return trailing_byte;
    }
    if (input < 0xC2) {
        return invalid_utf8_byte;
    }
    if (input < 0xE0) {
        return head_double_byte;
    }
    if (input < 0xF0) {
        return head_triple_byte;
    }
    if (input < 0xF5) {
        return head_quad_byte;
    }
    return invalid_utf8_byte;
}
// Appends one character to the end of a utf8_char array.
//
// utf8_str:        address of the array pointer; the array itself is written
//                  in place and is never resized here.
// utf8_str_length: in/out element count; the new element is stored at this
//                  index and the count is then incremented by one.
// byte_buffer:     byte buffer stored (not copied) in the new element.
// buf_length:      number of bytes in byte_buffer.
//
// No capacity check is made: the caller must have room for one more element.
void push_byte_buffer_to_utf8_str(utf8_char **utf8_str, size_t *utf8_str_length, char *byte_buffer, int buf_length) {
    const size_t slot_index = *utf8_str_length;
    utf8_char *slot = &(*utf8_str)[slot_index];

    slot->bytes = byte_buffer;
    slot->length = (size_t)buf_length;

    *utf8_str_length = slot_index + 1;
}
// Releases a utf8_char array together with the byte buffer of each element.
//
// str:    the array to free; the array pointer itself is freed last.
// length: number of elements whose `bytes` buffer is freed. A value of zero
//         or less frees only the array.
void free_utf8_str(utf8_char *str, int length) {
    utf8_char *cursor = str;
    for (int remaining = length; remaining > 0; remaining--) {
        free(cursor->bytes);
        cursor++;
    }
    free(str);
}
int parse_to_utf8(char *input, utf8_char **output_string, size_t *output_length) {
// TODO: 5 levels of indent is too much. Lower it some.
const size_t input_str_length = strlen(input);
utf8_char *output_str = malloc(input_str_length * sizeof(utf8_char));
size_t utf8_str_len = 0;
// TODO: can this be converted back to for loop?
size_t i = 0;
while (i < input_str_length) {
unsigned char current_char = input[i];
utf8_byte_type byte_type = get_utf8_byte_type(current_char);
switch (byte_type) {
case single_ASCII_byte: {
char *byte_buffer = malloc(sizeof(char));
*byte_buffer = current_char;
push_byte_buffer_to_utf8_str(&output_str, &utf8_str_len, byte_buffer, 1);
i++;
break;
}
case head_double_byte:
case head_triple_byte:
case head_quad_byte: {
size_t j;
char *byte_buffer = malloc(byte_type);
// Insert heading byte into buffer
byte_buffer[0] = input[i];
// Insert following bytes into buffer
for (j = i + 1; j < i + byte_type; j++) {
const unsigned char d_cchar = input[j];
if (d_cchar == '\0')
break;
utf8_byte_type b_type = get_utf8_byte_type(d_cchar);
if (b_type != trailing_byte) {
free_utf8_str(output_str, utf8_str_len);
return 1;
}
byte_buffer[j - i] = d_cchar;
}
i = j;
push_byte_buffer_to_utf8_str(&output_str, &utf8_str_len, byte_buffer, byte_type);
break;
}
case invalid_utf8_byte:
// Trailing byte without a head byte is invalid, thus should create an error
case trailing_byte:
free_utf8_str(output_str, utf8_str_len);
return 1;
}
}
if (input_str_length != utf8_str_len) {
// Shrink output string to proper size
output_str = realloc(output_str, utf8_str_len);
}
*output_length = utf8_str_len;
*output_string = output_str;
return 0;
}
// Reports whether two characters hold exactly the same byte sequence.
//
// a, b: the characters to compare; neither is modified.
// Returns true when both have the same length and identical bytes, false
// otherwise. Two zero-length characters compare equal without their byte
// buffers being read.
bool utf8_char_eq(utf8_char *a, utf8_char *b) {
    const size_t byte_count = a->length;
    if (byte_count != b->length) {
        return false;
    }
    if (byte_count == 0) {
        return true;
    }
    return memcmp(a->bytes, b->bytes, byte_count) == 0;
}
// Writes every byte of every character in the array to stdout, in order.
//
// str:    array of characters to print.
// length: number of elements in str.
//
// No separator or trailing newline is written, and write errors are not
// reported (the function returns nothing).
void print_utf8_str(utf8_char *str, size_t length) {
    for (size_t i = 0; i < length; i++) {
        for (size_t j = 0; j < str[i].length; j++) {
            putc(str[i].bytes[j], stdout);
        }
    }
}
// Makes a deep copy of a character: the result owns a newly allocated byte
// buffer holding the same bytes as `from`.
//
// from: the character to copy; it is not modified.
// Returns the copy by value. The caller owns `bytes` and must free() it.
// If memory cannot be allocated the result is { .bytes = NULL, .length = 0 },
// so a NULL `bytes` always signals failure.
utf8_char copy_utf8_char(utf8_char *from) {
    utf8_char to_char = { .bytes = NULL, .length = 0 };

    // Request at least one byte so that a successful copy of an empty
    // character still yields a non-NULL buffer.
    char *bytes = malloc(from->length > 0 ? from->length : 1);
    if (bytes == NULL) {
        return to_char;
    }
    if (from->length > 0) {
        memcpy(bytes, from->bytes, from->length);
    }

    to_char.bytes = bytes;
    to_char.length = from->length;
    return to_char;
}