// ACaesar/utf8hack.c

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// One UTF-8 encoded character, stored as its raw bytes.
typedef struct utf8_char {
    // Heap buffer holding the encoded bytes; it is NOT NUL-terminated
    // (parse_to_utf8 allocates exactly `length` bytes for it).
    char *bytes;
    // Number of bytes in `bytes`.
    size_t length;
} utf8_char;

// Role a single byte plays inside a UTF-8 encoded stream.
//
// The numeric value of each head_* enumerator equals the length in bytes of
// the sequence it introduces (2, 3 or 4), so the values are spelled out
// explicitly instead of being left to implicit numbering.
typedef enum {
    single_ASCII_byte = 0, // 0xxxxxxx: a complete one-byte character
    trailing_byte = 1,     // 10xxxxxx: continuation of a multi-byte character
    head_double_byte = 2,  // 110xxxxx: first byte of a two-byte character
    head_triple_byte = 3,  // 1110xxxx: first byte of a three-byte character
    head_quad_byte = 4,    // 11110xxx: first byte of a four-byte character
    invalid_utf8_byte = 5, // 11111xxx: matches none of the patterns above
} utf8_byte_type;

// Classify one byte by its leading bit pattern.
//
// Only the high bits are inspected: a byte is reported as a head byte purely
// from its prefix, so values such as 0xC0, 0xC1 and 0xF5-0xF7 are classified
// as head bytes here even though no well-formed UTF-8 text contains them.
//
// Returns the matching utf8_byte_type, or invalid_utf8_byte when the byte
// starts with five or more 1 bits.
utf8_byte_type get_utf8_byte_type(unsigned char input) {
    // 0xxxxxxx
    if ((input & 0x80) == 0x00) {
        return single_ASCII_byte;
    }
    // 10xxxxxx
    if ((input & 0xC0) == 0x80) {
        return trailing_byte;
    }
    // 110xxxxx
    if ((input & 0xE0) == 0xC0) {
        return head_double_byte;
    }
    // 1110xxxx
    if ((input & 0xF0) == 0xE0) {
        return head_triple_byte;
    }
    // 11110xxx
    if ((input & 0xF8) == 0xF0) {
        return head_quad_byte;
    }
    return invalid_utf8_byte;
}

// Append one character to the end of a utf8_char array.
//
// utf8_str        - address of the array pointer; the array itself is written
//                   to, the pointer is not reallocated here.
// utf8_str_length - in/out count of used elements; incremented by one.
// byte_buffer     - bytes of the new character. The pointer is stored as-is
//                   (no copy is made), so the array takes it over.
// buf_length      - number of bytes in byte_buffer.
//
// No capacity check is performed: the caller must guarantee that the array
// has room for at least *utf8_str_length + 1 elements.
void push_byte_buffer_to_utf8_str(utf8_char **utf8_str, size_t *utf8_str_length, char *byte_buffer, int buf_length) {
    utf8_char *slot = &(*utf8_str)[*utf8_str_length];

    slot->bytes = byte_buffer;
    slot->length = buf_length;

    (*utf8_str_length)++;
}

// Release a utf8_char array together with the byte buffer of each element.
//
// str    - array to free; the array pointer itself is freed last.
// length - number of elements whose `bytes` buffer must be freed. A value
//          of zero or less frees only the array.
void free_utf8_str(utf8_char *str, int length) {
    utf8_char *cursor = str;
    int remaining = length;

    while (remaining > 0) {
        free(cursor->bytes);
        cursor++;
        remaining--;
    }
    free(str);
}

// Split a NUL-terminated byte string into an array of UTF-8 characters.
//
// input         - NUL-terminated string to parse; it is only read.
// output_string - on success receives a newly allocated array with one
//                 utf8_char per decoded character. The caller owns it and
//                 releases it with free_utf8_str().
// output_length - on success receives the number of elements in the array.
//
// Returns 0 on success. Returns 1, leaving both output parameters untouched
// and freeing everything allocated so far, when:
//   - a byte cannot start a character (stray trailing byte or invalid byte),
//   - a head byte is not followed by the required number of trailing bytes,
//     including when the string ends in the middle of a sequence,
//   - a memory allocation fails.
int parse_to_utf8(char *input, utf8_char **output_string, size_t *output_length) {
    const size_t input_str_length = strlen(input);
    size_t utf8_str_len = 0;

    // Every character occupies at least one input byte, so one slot per byte
    // is always enough. Ask for at least one slot so that a NULL result can
    // only mean failure (malloc(0) is allowed to return NULL).
    const size_t capacity = input_str_length > 0 ? input_str_length : 1;
    utf8_char *output_str = malloc(capacity * sizeof *output_str);
    if (output_str == NULL) {
        return 1;
    }

    size_t i = 0;
    while (i < input_str_length) {
        const unsigned char current_char = (unsigned char)input[i];
        size_t char_length = 0;

        switch (get_utf8_byte_type(current_char)) {
        case single_ASCII_byte:
            char_length = 1;
            break;
        case head_double_byte:
            char_length = 2;
            break;
        case head_triple_byte:
            char_length = 3;
            break;
        case head_quad_byte:
            char_length = 4;
            break;
        // A trailing byte without a head byte is as invalid as a byte that
        // matches no UTF-8 pattern at all.
        case trailing_byte:
        case invalid_utf8_byte:
        default:
            goto fail;
        }

        // The sequence must fit in what is left of the input; otherwise the
        // string was cut off in the middle of a character.
        if (char_length > input_str_length - i) {
            goto fail;
        }

        // Validate every continuation byte before allocating, so the error
        // path has no per-character buffer to clean up.
        for (size_t k = 1; k < char_length; k++) {
            if (get_utf8_byte_type((unsigned char)input[i + k]) != trailing_byte) {
                goto fail;
            }
        }

        char *byte_buffer = malloc(char_length);
        if (byte_buffer == NULL) {
            goto fail;
        }
        memcpy(byte_buffer, input + i, char_length);
        push_byte_buffer_to_utf8_str(&output_str, &utf8_str_len, byte_buffer, (int)char_length);

        i += char_length;
    }

    if (utf8_str_len != input_str_length) {
        // Shrink the array to the number of characters actually stored. The
        // size is in bytes, hence the multiplication by the element size.
        // utf8_str_len cannot be 0 here: a non-empty input yields at least
        // one character or fails above.
        utf8_char *shrunk = realloc(output_str, utf8_str_len * sizeof *output_str);
        if (shrunk != NULL) {
            output_str = shrunk;
        }
        // If shrinking fails the original, larger block is still valid.
    }

    *output_length = utf8_str_len;
    *output_string = output_str;
    return 0;

fail:
    free_utf8_str(output_str, (int)utf8_str_len);
    return 1;
}

// Report whether two characters consist of exactly the same bytes.
//
// Returns true when both have the same length and every byte matches,
// false otherwise. Two zero-length characters compare equal.
bool utf8_char_eq(utf8_char *a, utf8_char *b) {
    const size_t byte_count = a->length;

    if (byte_count != b->length) {
        return false;
    }

    const char *lhs = a->bytes;
    const char *rhs = b->bytes;
    for (size_t remaining = byte_count; remaining > 0; remaining--) {
        if (*lhs++ != *rhs++) {
            return false;
        }
    }
    return true;
}

// Write every byte of every character in the array to stdout, in order.
//
// str    - array of characters to print.
// length - number of elements in str.
//
// Nothing is appended after the last character (no newline), and write
// errors reported by putc are not checked.
void print_utf8_str(utf8_char *str, size_t length) {
    for (size_t i = 0; i < length; i++) {
        const utf8_char *current = &str[i];
        for (size_t j = 0; j < current->length; j++) {
            putc(current->bytes[j], stdout);
        }
    }
}

// Create an independent copy of a character.
//
// from - character to duplicate; it is only read.
//
// Returns a utf8_char whose `bytes` buffer is freshly allocated and owned by
// the caller (release it with free()). If the allocation fails the result
// has bytes == NULL and length == 0, so it never advertises bytes that are
// not there.
utf8_char copy_utf8_char(utf8_char *from) {
    utf8_char to_char;
    to_char.bytes = NULL;
    to_char.length = 0;

    char *copy = malloc(sizeof(char) * from->length);
    if (copy == NULL) {
        // Allocation failed, or a zero-byte request returned NULL.
        return to_char;
    }

    // Skip the copy for an empty character so memcpy is never handed a
    // source pointer that may be NULL.
    if (from->length > 0) {
        memcpy(copy, from->bytes, from->length);
    }

    to_char.bytes = copy;
    to_char.length = from->length;
    return to_char;
}
Initial Commit 2023-09-04 00:48:08 +00:00			`#include <stdbool.h>`
			`#include <stdio.h>`
			`#include <stdlib.h>`
			`#include <string.h>`
			`typedef struct utf8_char {`
			`char *bytes;`
			`size_t length;`
			`} utf8_char;`

			`typedef enum {`
			`single_ASCII_byte,`
			`trailing_byte,`
			`head_double_byte,`
			`head_triple_byte,`
			`head_quad_byte,`
			`invalid_utf8_byte,`
			`} utf8_byte_type;`

			`utf8_byte_type get_utf8_byte_type(unsigned char input) {`

			`if ((input & 0b10000000) == 0b10000000) {`

			`if ((input & 0b11000000) == 0b10000000) {`
			`return trailing_byte;`
			`}`
			`if ((input & 0b11100000) == 0b11000000) {`
			`return head_double_byte;`
			`}`
			`if ((input & 0b11110000) == 0b11100000) {`
			`return head_triple_byte;`
			`}`
			`if ((input & 0b11111000) == 0b11110000) {`
			`return head_quad_byte;`
			`}`
			`return invalid_utf8_byte;`
			`}`
			`return single_ASCII_byte;`
			`}`

			`void push_byte_buffer_to_utf8_str(utf8_char *utf8_str, size_t utf8_str_length, char *byte_buffer, int buf_length) {`
			`size_t current_length = *utf8_str_length;`
			`(*utf8_str)[current_length].bytes = byte_buffer;`
			`(*utf8_str)[current_length].length = buf_length;`

			`*utf8_str_length = current_length + 1;`
			`}`

			`void free_utf8_str(utf8_char *str, int length) {`
			`for (int i = 0; i < length; i++) {`
			`free(str[i].bytes);`
			`}`
			`free(str);`
			`}`

			`int parse_to_utf8(char input, utf8_char output_string, size_t output_length) {`
			`// TODO: 5 levels of indent is too much. Lower it some.`
			`const size_t input_str_length = strlen(input);`
			`utf8_char output_str = malloc(input_str_length sizeof(utf8_char));`
			`size_t utf8_str_len = 0;`

			`// TODO: can this be converted back to for loop?`
			`size_t i = 0;`
			`while (i < input_str_length) {`
			`unsigned char current_char = input[i];`
			`utf8_byte_type byte_type = get_utf8_byte_type(current_char);`

			`switch (byte_type) {`
			`case single_ASCII_byte: {`
			`char *byte_buffer = malloc(sizeof(char));`
			`*byte_buffer = current_char;`
			`push_byte_buffer_to_utf8_str(&output_str, &utf8_str_len, byte_buffer, 1);`
			`i++;`
			`break;`
			`}`
			`case head_double_byte:`
			`case head_triple_byte:`
			`case head_quad_byte: {`
			`size_t j;`
			`char *byte_buffer = malloc(byte_type);`
			`// Insert heading byte into buffer`
			`byte_buffer[0] = input[i];`
			`// Insert following bytes into buffer`
			`for (j = i + 1; j < i + byte_type; j++) {`
			`const unsigned char d_cchar = input[j];`
			`if (d_cchar == '\0')`
			`break;`

			`utf8_byte_type b_type = get_utf8_byte_type(d_cchar);`
			`if (b_type != trailing_byte) {`
			`free_utf8_str(output_str, utf8_str_len);`
			`return 1;`
			`}`
			`byte_buffer[j - i] = d_cchar;`
			`}`
			`i = j;`
			`push_byte_buffer_to_utf8_str(&output_str, &utf8_str_len, byte_buffer, byte_type);`

			`break;`
			`}`
			`case invalid_utf8_byte:`
			`// Trailing byte without a head byte is invalid, thus should create an error`
			`case trailing_byte:`
			`free_utf8_str(output_str, utf8_str_len);`
			`return 1;`
			`}`
			`}`
			`if (input_str_length != utf8_str_len) {`
			`// Shrink output string to proper size`
			`output_str = realloc(output_str, utf8_str_len);`
			`}`
			`*output_length = utf8_str_len;`
			`*output_string = output_str;`
			`return 0;`
			`}`

			`bool utf8_char_eq(utf8_char a, utf8_char b) {`
			`if (a->length != b->length) {`
			`return false;`
			`}`
			`size_t length = a->length;`
			`for (size_t i = 0; i < length; i++) {`
			`if (a->bytes[i] != b->bytes[i]) {`
			`return false;`
			`}`
			`}`
			`return true;`
			`}`

			`void print_utf8_str(utf8_char *str, size_t length) {`
			`for (size_t i = 0; i < length; i++) {`
			`for (size_t j = 0; j < str[i].length; j++) {`
			`putc(str[i].bytes[j], stdout);`
			`}`
			`}`
			`};`

			`utf8_char copy_utf8_char(utf8_char *from) {`
			`utf8_char to_char;`

			`to_char.length = from->length;`
			`to_char.bytes = malloc(sizeof(char) * from->length);`
			`for (size_t i = 0; i < from->length; i++) {`
			`to_char.bytes[i] = from->bytes[i];`
			`}`
			`return to_char;`
			`}`