Initial Commit

2023-09-03 20:48:08 -04:00 · 2023-09-03 20:48:08 -04:00 · 41d574f45f
commit 41d574f45f
7 changed files with 350 additions and 0 deletions
--- a/.clang-format
+++ b/.clang-format
@ -0,0 +1,2 @@
 ColumnLimit: 120
 IndentWidth: 4
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
 acaesar
--- a/README.md
+++ b/README.md
@ -0,0 +1,20 @@
 # ACaesar - Caesar cipher transcoder
 This program encodes and decodes messages using a [Caesar Cipher](https://en.wikipedia.org/wiki/Caesar_cipher).
 ```
 Usage: acaesar [options] message
 Options:
   --help             Print this help message
   -o <offset>        Set the integer offset used in transcoding
   -c <character set> Set the character set
   --xyz              Set the character set to the lowercase alphabet (default)
 ```
 ## Compiling
 The program is simple enough that it should compile with any modern C compiler; it was developed using GCC on Ubuntu. A Makefile is included, which contains a single build command:
 ```
 gcc acaesar.c utf8hack.c -O3 -Wall -Wextra -o acaesar
 ```
--- a/acaesar.c
+++ b/acaesar.c
@ -0,0 +1,160 @@
 #include "utf8hack.h"
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 // Returns the index of the first element of `utf8_string` (which holds `string_length` elements) that is equal
 // to `search_char`, or -1 when no element matches.
 int get_index_of_utf8_char(utf8_char *utf8_string, size_t string_length, utf8_char search_char);
 // Replaces every character of `message_str` that occurs in `charset_str` with the character `offset` positions
 // further along the character set (taken modulo its length), then prints the result and a newline to stdout.
 void transcode(long offset, char *charset_str, char *message_str);
 // Usage text printed by `main` when the `--help` flag is given.
 // The literal spans several source lines via backslash-newline continuations inside the string.
 #define HELP_MESSAGE                                                                                                   \
    "ACaesar - Caesar Cipher transcoder version 0.1\n\
 Usage: acaesar [options] message\n\
 Options:\n\
   --help             Print this help message\n\
   -o <offset>        Set the integer offset used in transcoding\n\
   -c <character set> Set the character set\n\
   --xyz              Set the character set to the lowercase alphabet (default)\n"
 /*
  * Program entry point: parses the command line and transcodes the message.
  *
  * Recognised arguments:
  *   --help        print the usage text and exit successfully
  *   -o <offset>   integer shift to apply (required)
  *   -c <charset>  character set to shift within
  *   --xyz         use the lowercase alphabet (also the default)
  *   <message>     must be the last argument
  *
  * Returns 0 on success and 1 on any usage or allocation error. Every exit path goes through `cleanup`
  * so heap copies made while parsing are always released.
  */
 int main(int argc, char *argv[]) {
    bool charset_set = false;
    bool message_set = false;
    bool offset_set = false;
    // True only while `charset` points at heap memory owned by main. The default / `--xyz` character set is a
    // string literal and must never be passed to free().
    bool alloc_charset = false;
    long offset = 0;
    char *charset = NULL;
    char *message = NULL;
    int exit_code = 1;

    // Argument parsing
    for (int i = 1; i < argc; i++) {
        char *current_arg = argv[i];

        // Help flag
        if (strcmp(current_arg, "--help") == 0) {
            // fputs: the help text is data, not a format string.
            fputs(HELP_MESSAGE, stdout);
            exit_code = 0;
            goto cleanup;
        }

        // Charset flag
        if (strcmp(current_arg, "-c") == 0) {
            if (i == argc - 1) {
                fprintf(stderr, "Error, charset (-c) flag found without following charset.\n");
                goto cleanup;
            }
            if (charset_set) {
                fprintf(stderr, "Error, character set specified twice.\n");
                goto cleanup;
            }
            char *next_arg = argv[i + 1];
            charset = malloc(strlen(next_arg) + 1);
            if (charset == NULL) {
                fprintf(stderr, "Error, out of memory.\n");
                goto cleanup;
            }
            strcpy(charset, next_arg);
            charset_set = true;
            alloc_charset = true;
            i++;
            continue;
        }

        // Offset flag
        if (strcmp(current_arg, "-o") == 0) {
            if (i == argc - 1) {
                fprintf(stderr, "Error, offset (-o) flag found without following offset.\n");
                goto cleanup;
            }
            if (offset_set) {
                fprintf(stderr, "Error, offset (-o) specified twice.\n");
                goto cleanup;
            }
            char *next_arg = argv[i + 1];
            char *parse_end = NULL;
            offset = strtol(next_arg, &parse_end, 10);
            // Reject empty input and trailing garbage instead of silently treating it as 0.
            if (parse_end == next_arg || *parse_end != '\0') {
                fprintf(stderr, "Error, offset (-o) must be an integer, found: '%s'\n", next_arg);
                goto cleanup;
            }
            offset_set = true;
            i++;
            continue;
        }

        // Alphabetical charset flag (the default anyway); only exists for completeness.
        if (strcmp(current_arg, "--xyz") == 0) {
            if (charset_set) {
                fprintf(stderr, "Error, character set specified twice.\n");
                goto cleanup;
            }
            charset = "abcdefghijklmnopqrstuvwxyz";
            charset_set = true;
            continue;
        }

        if (i != argc - 1) {
            // Unrecognized flag
            fprintf(stderr, "Error, unknown argument found: '%s'\n", current_arg);
            goto cleanup;
        }

        // Message to be transcoded: the last argument is assumed to be the message.
        message = malloc(strlen(current_arg) + 1);
        if (message == NULL) {
            fprintf(stderr, "Error, out of memory.\n");
            goto cleanup;
        }
        strcpy(message, current_arg);
        message_set = true;
    }

    if (!charset_set) {
        // Default to a...z when no charset is found.
        charset = "abcdefghijklmnopqrstuvwxyz";
        charset_set = true;
    }
    if (!offset_set) {
        fprintf(stderr, "Error, no offset found. Use '-o' to set one.\n");
        goto cleanup;
    }
    if (!message_set) {
        fprintf(stderr, "Error, no message found to transcode.\n");
        goto cleanup;
    }

    transcode(offset, charset, message);
    exit_code = 0;

 cleanup:
    free(message);
    // Only a charset copied from the command line lives on the heap.
    if (alloc_charset) {
        free(charset);
    }
    return exit_code;
 }
 // Linear search: yields the position of the first element of `utf8_string` that compares equal to
 // `search_char` (per utf8_char_eq), or -1 if none of the `string_length` elements match.
 int get_index_of_utf8_char(utf8_char *utf8_string, size_t string_length, utf8_char search_char) {
    size_t position = 0;
    while (position < string_length) {
        if (utf8_char_eq(utf8_string + position, &search_char)) {
            return position;
        }
        position++;
    }
    return -1;
 }
 /*
  * Applies the Caesar shift to `message_str` and prints the result, followed by a newline, to stdout.
  *
  * offset      - number of positions to shift within the character set; may be negative (e.g. to decode)
  * charset_str - UTF-8 encoded character set; characters outside it are copied unchanged
  * message_str - UTF-8 encoded message
  *
  * If either string is rejected by parse_to_utf8, or memory runs out, an error is written to stderr and
  * nothing is printed to stdout. All memory allocated here is released before returning.
  */
 void transcode(long offset, char *charset_str, char *message_str) {
    size_t charset_length = 0;
    utf8_char *charset = NULL;
    if (parse_to_utf8(charset_str, &charset, &charset_length) != 0) {
        fprintf(stderr, "Error, character set is not valid UTF-8.\n");
        return;
    }

    size_t message_length = 0;
    utf8_char *message = NULL;
    if (parse_to_utf8(message_str, &message, &message_length) != 0) {
        fprintf(stderr, "Error, message is not valid UTF-8.\n");
        free_utf8_str(charset, charset_length);
        return;
    }

    // Request at least one element so an empty message cannot be mistaken for an allocation failure.
    utf8_char *transcoded_message = malloc((message_length > 0 ? message_length : 1) * sizeof(utf8_char));
    if (transcoded_message == NULL) {
        fprintf(stderr, "Error, out of memory.\n");
        free_utf8_str(charset, charset_length);
        free_utf8_str(message, message_length);
        return;
    }

    // Signed copy of the length: mixing `long` with `size_t` in `%` would convert a negative sum to unsigned
    // and select the wrong character.
    const long charset_size = (long)charset_length;

    for (size_t i = 0; i < message_length; i++) {
        // A hashmap would likely be more efficient for larger messages, but would introduce much complexity.
        int a = get_index_of_utf8_char(charset, charset_length, message[i]);
        if (a != -1) {
            // A match implies charset_size > 0, so the modulo is safe. Reducing `offset` first keeps the
            // addition from overflowing; the final adjustment maps C's negative remainder into range.
            long shifted = (a + offset % charset_size) % charset_size;
            if (shifted < 0) {
                shifted += charset_size;
            }
            transcoded_message[i] = copy_utf8_char(&charset[shifted]);
        } else {
            // If character is not in character set, copy it over unchanged.
            transcoded_message[i] = copy_utf8_char(&message[i]);
        }
    }

    print_utf8_str(transcoded_message, message_length);
    printf("\n");

    free_utf8_str(charset, charset_length);
    free_utf8_str(message, message_length);
    free_utf8_str(transcoded_message, message_length);
 }
--- a/2
+++ b/2
@ -0,0 +1,2 @@
 # Compiles both translation units into the `acaesar` executable with -O3 and -Wall -Wextra.
 build:
 	gcc acaesar.c utf8hack.c -O3 -Wall -Wextra -o acaesar
--- a/utf8hack.c
+++ b/utf8hack.c
@ -0,0 +1,146 @@
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 // One parsed character: the raw bytes of a single UTF-8 sequence.
 // NOTE(review): this definition duplicates the one in utf8hack.h; the two must be kept identical.
 typedef struct utf8_char {
    char *bytes;   // heap buffer holding the sequence; not NUL-terminated, released by free_utf8_str
    size_t length; // number of bytes stored in `bytes`
 } utf8_char;
 // Classification of a single byte within a UTF-8 stream.
 // The numeric values of the head_* enumerators are load-bearing: parse_to_utf8 uses them directly as the
 // total byte count of the sequence they introduce, so they are spelled out rather than left implicit.
 typedef enum {
    single_ASCII_byte = 0, // 0xxxxxxx
    trailing_byte = 1,     // 10xxxxxx
    head_double_byte = 2,  // 110xxxxx, starts a 2-byte sequence
    head_triple_byte = 3,  // 1110xxxx, starts a 3-byte sequence
    head_quad_byte = 4,    // 11110xxx, starts a 4-byte sequence
    invalid_utf8_byte = 5, // any other bit pattern
 } utf8_byte_type;
 // Classifies `input` by its leading bits:
 //   0xxxxxxx -> single_ASCII_byte    10xxxxxx -> trailing_byte
 //   110xxxxx -> head_double_byte     1110xxxx -> head_triple_byte
 //   11110xxx -> head_quad_byte       anything else -> invalid_utf8_byte
 utf8_byte_type get_utf8_byte_type(unsigned char input) {
    // High bit clear: plain ASCII.
    if ((input & 0x80) == 0) {
        return single_ASCII_byte;
    }
    if ((input & 0xC0) == 0x80) {
        return trailing_byte;
    }
    if ((input & 0xE0) == 0xC0) {
        return head_double_byte;
    }
    if ((input & 0xF0) == 0xE0) {
        return head_triple_byte;
    }
    if ((input & 0xF8) == 0xF0) {
        return head_quad_byte;
    }
    return invalid_utf8_byte;
 }
 // Stores `byte_buffer` (holding `buf_length` bytes) in the slot at index `*utf8_str_length` of the array
 // `*utf8_str`, then increments `*utf8_str_length`. The array is not grown here, so the caller must already
 // have room for the new element. The buffer pointer is stored as-is, not copied.
 void push_byte_buffer_to_utf8_str(utf8_char **utf8_str, size_t *utf8_str_length, char *byte_buffer, int buf_length) {
    utf8_char *slot = &(*utf8_str)[*utf8_str_length];
    slot->bytes = byte_buffer;
    slot->length = buf_length;
    *utf8_str_length += 1;
 }
 // Releases the `bytes` buffer of each of the first `length` elements of `str`, then releases `str` itself.
 void free_utf8_str(utf8_char *str, int length) {
    int index = 0;
    while (index < length) {
        free(str[index].bytes);
        index++;
    }
    free(str);
 }
 /*
  * Splits the NUL-terminated string `input` into an array with one utf8_char per UTF-8 sequence.
  *
  * On success returns 0, stores the malloc'd array in `*output_string` and its element count in
  * `*output_length`; the caller releases it with free_utf8_str.
  *
  * Returns 1, leaving both outputs unwritten and freeing everything allocated so far, when:
  *   - a byte is not a valid lead byte (including a continuation byte with no lead byte),
  *   - a multi-byte sequence is cut short by the end of the string,
  *   - a byte after a lead byte is not a continuation byte, or
  *   - memory allocation fails.
  *
  * Only the byte structure is checked; overlong encodings and surrogates are not rejected.
  */
 int parse_to_utf8(char *input, utf8_char **output_string, size_t *output_length) {
    const size_t input_str_length = strlen(input);
    // Worst case is one character per input byte. Request at least one element so that an empty input does
    // not turn into malloc(0), whose NULL result would be indistinguishable from failure.
    utf8_char *output_str = malloc((input_str_length > 0 ? input_str_length : 1) * sizeof(utf8_char));
    if (output_str == NULL) {
        return 1;
    }
    size_t utf8_str_len = 0;

    size_t i = 0;
    while (i < input_str_length) {
        // Total number of bytes in the sequence starting at input[i].
        size_t sequence_length = 0;
        switch (get_utf8_byte_type((unsigned char)input[i])) {
        case single_ASCII_byte:
            sequence_length = 1;
            break;
        case head_double_byte:
            sequence_length = 2;
            break;
        case head_triple_byte:
            sequence_length = 3;
            break;
        case head_quad_byte:
            sequence_length = 4;
            break;
        // A trailing byte without a head byte is invalid, as is any unrecognised bit pattern.
        case trailing_byte:
        case invalid_utf8_byte:
        default:
            goto failure;
        }

        // The sequence must fit entirely inside the string.
        if (sequence_length > input_str_length - i) {
            goto failure;
        }
        // Validate before allocating so no buffer has to be unwound on malformed input.
        for (size_t k = 1; k < sequence_length; k++) {
            if (get_utf8_byte_type((unsigned char)input[i + k]) != trailing_byte) {
                goto failure;
            }
        }

        char *byte_buffer = malloc(sequence_length);
        if (byte_buffer == NULL) {
            goto failure;
        }
        memcpy(byte_buffer, input + i, sequence_length);
        push_byte_buffer_to_utf8_str(&output_str, &utf8_str_len, byte_buffer, (int)sequence_length);
        i += sequence_length;
    }

    if (utf8_str_len > 0 && utf8_str_len != input_str_length) {
        // Shrink the array to the number of characters actually stored. The size is in bytes, hence the
        // multiplication. If shrinking fails the original, larger block is still valid and is kept.
        utf8_char *shrunk = realloc(output_str, utf8_str_len * sizeof(utf8_char));
        if (shrunk != NULL) {
            output_str = shrunk;
        }
    }
    *output_length = utf8_str_len;
    *output_string = output_str;
    return 0;

 failure:
    free_utf8_str(output_str, utf8_str_len);
    return 1;
 }
 // Two characters are equal when they have the same byte length and every byte matches.
 bool utf8_char_eq(utf8_char *a, utf8_char *b) {
    const size_t byte_count = a->length;
    if (byte_count != b->length) {
        return false;
    }
    const char *lhs = a->bytes;
    const char *rhs = b->bytes;
    for (size_t remaining = byte_count; remaining > 0; remaining--) {
        if (*lhs != *rhs) {
            return false;
        }
        lhs++;
        rhs++;
    }
    return true;
 }
 // Writes the raw bytes of the first `length` characters of `str` to stdout, in order, with no separator
 // and no trailing newline.
 void print_utf8_str(utf8_char *str, size_t length) {
    size_t char_index = 0;
    while (char_index < length) {
        size_t byte_index = 0;
        while (byte_index < str[char_index].length) {
            putc(str[char_index].bytes[byte_index], stdout);
            byte_index++;
        }
        char_index++;
    }
 }
 /*
  * Returns a deep copy of `*from`: the result has its own malloc'd `bytes` buffer holding the same bytes.
  * The caller owns the buffer and releases it with free() (or free_utf8_str on the containing array).
  *
  * If allocation fails the result is { .bytes = NULL, .length = 0 }, which is safe to print and to free.
  */
 utf8_char copy_utf8_char(utf8_char *from) {
    utf8_char to_char = {.bytes = NULL, .length = 0};
    // Never request zero bytes, so a NULL result always means failure.
    char *buffer = malloc(from->length > 0 ? from->length : 1);
    if (buffer == NULL) {
        return to_char;
    }
    if (from->length > 0) {
        memcpy(buffer, from->bytes, from->length);
    }
    to_char.bytes = buffer;
    to_char.length = from->length;
    return to_char;
 }
--- a/utf8hack.h
+++ b/utf8hack.h
@ -0,0 +1,19 @@
 #ifndef UTF8_HACK
 #define UTF8_HACK
 // hacky implementation of UTF-8 parsing. Doesn't do much validation.
 // Primary purpose is to separate bytes into unicode codepoints.
 #include <stdbool.h>
 #include <stddef.h>
 // One parsed character: the raw bytes of a single UTF-8 sequence.
 typedef struct utf8_char {
    char *bytes;   // heap buffer holding the sequence; not NUL-terminated
    size_t length; // number of bytes stored in `bytes`
 } utf8_char;
 // Returns true when `a` and `b` have the same length and identical bytes.
 bool utf8_char_eq(utf8_char *a, utf8_char *b);
 // Splits the NUL-terminated `input` into an array of utf8_char. On success returns 0 and stores the malloc'd
 // array in `*output_string` and its element count in `*output_length`; release it with free_utf8_str.
 // Returns 1, without writing the outputs, when the input is rejected as malformed.
 int parse_to_utf8(char *input, utf8_char **output_string, size_t *output_length);
 // Writes the bytes of the first `length` characters of `str` to stdout; no newline is appended.
 void print_utf8_str(utf8_char *str, size_t length);
 // Frees the `bytes` buffer of each of the first `length` elements of `str`, then frees `str` itself.
 void free_utf8_str(utf8_char *str, int length);
 // Returns a copy of `*from` that has its own malloc'd `bytes` buffer.
 utf8_char copy_utf8_char(utf8_char *from);
 #endif // UTF8_HACK
		`@ -0,0 +1,2 @@`
							`build:`
							`gcc acaesar.c utf8hack.c -O3 -Wall -Wextra -o acaesar`