Initial Commit

2023-09-03 20:48:08 -04:00 · 2023-09-03 20:48:08 -04:00 · 41d574f45f
commit 41d574f45f
7 changed files with 350 additions and 0 deletions
--- a/.clang-format
+++ b/.clang-format
@ -0,0 +1,2 @@
+ColumnLimit: 120
+IndentWidth: 4
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1 @@
+acaesar
--- a/README.md
+++ b/README.md
@ -0,0 +1,20 @@
+# ACaesar - Caesar cipher transcoder
+
+This program encodes and decodes messages using a [Caesar Cipher](https://en.wikipedia.org/wiki/Caesar_cipher).
+
+```
+Usage: acaesar [options] message
+Options:
+   --help             Print this help message
+   -o <offset>        Set the integer offset used in transcoding
+   -c <character set> Set the character set
+   --xyz              Set the character set to the lowercase alphabet (default)
+```
+
+## Compiling
+
+The program is simple and should compile with any modern C compiler. It was developed using GCC on Ubuntu. A makefile is included that contains a single build command:
+
+```
+gcc acaesar.c utf8hack.c -O3 -Wall -Wextra -o acaesar
+```
--- a/acaesar.c
+++ b/acaesar.c
@ -0,0 +1,160 @@
+
+#include "utf8hack.h"
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+int get_index_of_utf8_char(utf8_char *utf8_string, size_t string_length, utf8_char search_char); // index or -1
+
+void transcode(long offset, char *charset_str, char *message_str); // prints the shifted message to stdout
+
+#define HELP_MESSAGE                                                                                                   \
+    "ACaesar - Caesar Cipher transcoder version 0.1\n\
+Usage: acaesar [options] message\n\
+Options:\n\
+   --help             Print this help message\n\
+   -o <offset>        Set the integer offset used in transcoding\n\
+   -c <character set> Set the character set\n\
+   --xyz              Set the character set to the lowercase alphabet (default)\n"
+
+/*
+ * Program entry point: parses the command line and prints the transcoded message.
+ *
+ * Accepted arguments:
+ *   --help        print HELP_MESSAGE and exit successfully
+ *   -c <charset>  character set to rotate through (at most once, exclusive with --xyz)
+ *   -o <offset>   integer offset (required, at most once)
+ *   --xyz         use the lowercase alphabet, which is also the default
+ *   <message>     text to transcode; must be the last argument
+ *
+ * Returns 0 on success and 1 after reporting a usage error on stderr.
+ */
+int main(int argc, char *argv[]) {
+    // Default character set. Kept in a writable array because transcode() takes a non-const pointer.
+    static char lowercase_alphabet[] = "abcdefghijklmnopqrstuvwxyz";
+
+    bool charset_set = false;
+    bool offset_set = false;
+    long offset = 0;
+    // Both pointers only ever refer to argv entries or to the static default above. argv strings stay valid for
+    // the whole run, so nothing is copied and there is no ownership to track or free.
+    char *charset = NULL;
+    char *message = NULL;
+
+    // Argument parsing
+    for (int i = 1; i < argc; i++) {
+        char *current_arg = argv[i];
+        const bool is_last_arg = (i == argc - 1);
+
+        // Help flag
+        if (strcmp(current_arg, "--help") == 0) {
+            // fputs rather than printf: the help text is data, not a format string.
+            fputs(HELP_MESSAGE, stdout);
+            return 0;
+        }
+
+        // Charset flag
+        if (strcmp(current_arg, "-c") == 0) {
+            if (is_last_arg) {
+                fprintf(stderr, "Error, charset (-c) flag found without following charset.\n");
+                return 1;
+            }
+            if (charset_set) {
+                fprintf(stderr, "Error, character set specified twice.\n");
+                return 1;
+            }
+            charset = argv[++i]; // consume the flag's value as well
+            charset_set = true;
+            continue;
+        }
+
+        // Offset flag
+        if (strcmp(current_arg, "-o") == 0) {
+            if (is_last_arg) {
+                fprintf(stderr, "Error, offset (-o) flag found without following offset.\n");
+                return 1;
+            }
+            if (offset_set) {
+                fprintf(stderr, "Error, offset (-o) specified twice.\n");
+                return 1;
+            }
+            char *offset_arg = argv[++i]; // consume the flag's value as well
+            char *parse_end = NULL;
+            offset = strtol(offset_arg, &parse_end, 10);
+            // Reject empty input and trailing garbage instead of silently treating them as 0.
+            if (parse_end == offset_arg || *parse_end != '\0') {
+                fprintf(stderr, "Error, offset (-o) must be an integer, got '%s'.\n", offset_arg);
+                return 1;
+            }
+            offset_set = true;
+            continue;
+        }
+
+        // Alphabetical charset flag. It is the default anyway and only exists for completeness.
+        if (strcmp(current_arg, "--xyz") == 0) {
+            if (charset_set) {
+                fprintf(stderr, "Error, character set specified twice.\n");
+                return 1;
+            }
+            charset = lowercase_alphabet;
+            charset_set = true;
+            continue;
+        }
+
+        if (!is_last_arg) {
+            // Unrecognized flag
+            fprintf(stderr, "Error, unknown argument found: '%s'\n", current_arg);
+            return 1;
+        }
+
+        // The last argument is assumed to be the message to be transcoded.
+        message = current_arg;
+    }
+
+    if (!charset_set) {
+        // Default to a...z when no charset is given.
+        charset = lowercase_alphabet;
+    }
+    if (!offset_set) {
+        fprintf(stderr, "Error, no offset found. Use '-o' to set one.\n");
+        return 1;
+    }
+    if (message == NULL) {
+        fprintf(stderr, "Error, no message found to transcode.\n");
+        return 1;
+    }
+
+    transcode(offset, charset, message);
+    return 0;
+}
+
+/*
+ * Linear search for `search_char` in the first `string_length` elements of `utf8_string`.
+ * Returns the position of the first element that compares equal under utf8_char_eq(), or -1 when there is none.
+ */
+int get_index_of_utf8_char(utf8_char *utf8_string, size_t string_length, utf8_char search_char) {
+    size_t position = 0;
+    while (position < string_length) {
+        const bool matches = utf8_char_eq(utf8_string + position, &search_char);
+        if (matches) {
+            return position;
+        }
+        position++;
+    }
+    return -1;
+}
+
+/*
+ * Applies the Caesar shift and prints the result, followed by a newline, to stdout.
+ *
+ * offset       number of positions to move within the character set; negative values shift backwards,
+ *              which is how a message is decoded
+ * charset_str  UTF-8 string listing the characters that take part in the rotation
+ * message_str  UTF-8 string to transcode; characters that are not in the character set are copied unchanged
+ *
+ * The function has no way to return a status, so when either string is not valid UTF-8 or memory runs out it
+ * reports the problem on stderr, releases what it allocated and terminates the program with EXIT_FAILURE.
+ */
+void transcode(long offset, char *charset_str, char *message_str) {
+    size_t charset_length = 0;
+    utf8_char *charset = NULL;
+    if (parse_to_utf8(charset_str, &charset, &charset_length) != 0) {
+        // parse_to_utf8 has already released its partial result.
+        fprintf(stderr, "Error, character set is not valid UTF-8.\n");
+        exit(EXIT_FAILURE);
+    }
+
+    size_t message_length = 0;
+    utf8_char *message = NULL;
+    if (parse_to_utf8(message_str, &message, &message_length) != 0) {
+        fprintf(stderr, "Error, message is not valid UTF-8.\n");
+        free_utf8_str(charset, charset_length);
+        exit(EXIT_FAILURE);
+    }
+
+    utf8_char *transcoded_message = malloc(message_length * sizeof *transcoded_message);
+    if (transcoded_message == NULL && message_length > 0) {
+        fprintf(stderr, "Error, out of memory.\n");
+        free_utf8_str(charset, charset_length);
+        free_utf8_str(message, message_length);
+        exit(EXIT_FAILURE);
+    }
+
+    // Reduce the offset to a shift in [0, charset_length) using signed arithmetic. Mixing the signed offset
+    // directly with the unsigned length would wrap negative offsets modulo 2^N and select the wrong character.
+    size_t shift = 0;
+    if (charset_length > 0) {
+        const long modulus = (long)charset_length;
+        long reduced = offset % modulus;
+        if (reduced < 0) {
+            reduced += modulus;
+        }
+        shift = (size_t)reduced;
+    }
+
+    for (size_t i = 0; i < message_length; i++) {
+        // A hashmap would likely be more efficient for larger messages, but would introduce much complexity.
+        const int index = get_index_of_utf8_char(charset, charset_length, message[i]);
+        if (index != -1) {
+            transcoded_message[i] = copy_utf8_char(&charset[((size_t)index + shift) % charset_length]);
+        } else {
+            // If character is not in character set, copy it over unchanged.
+            transcoded_message[i] = copy_utf8_char(&message[i]);
+        }
+    }
+
+    print_utf8_str(transcoded_message, message_length);
+    printf("\n");
+
+    free_utf8_str(charset, charset_length);
+    free_utf8_str(message, message_length);
+    free_utf8_str(transcoded_message, message_length);
+}
--- a/2
+++ b/2
@ -0,0 +1,2 @@
+build:
+	gcc acaesar.c utf8hack.c -O3 -Wall -Wextra -o acaesar
--- a/utf8hack.c
+++ b/utf8hack.c
@ -0,0 +1,146 @@
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+typedef struct utf8_char { // NOTE(review): duplicates the definition in utf8hack.h; keep both in sync.
+    char *bytes;   // encoded bytes of one character; not NUL-terminated
+    size_t length; // number of bytes stored in `bytes`
+} utf8_char;
+
+/*
+ * Role a single byte plays inside a UTF-8 stream.
+ *
+ * The numeric values are spelled out because the parser uses the head_* values directly as the total number of
+ * bytes in the sequence (2, 3 and 4). Do not reorder or renumber them.
+ */
+typedef enum {
+    single_ASCII_byte = 0,
+    trailing_byte = 1,
+    head_double_byte = 2,
+    head_triple_byte = 3,
+    head_quad_byte = 4,
+    invalid_utf8_byte = 5,
+} utf8_byte_type;
+
+/*
+ * Classifies `input` by its high bits:
+ *   0xxxxxxx  single_ASCII_byte
+ *   10xxxxxx  trailing_byte (continuation)
+ *   110xxxxx  head_double_byte
+ *   1110xxxx  head_triple_byte
+ *   11110xxx  head_quad_byte
+ *   anything else is invalid_utf8_byte
+ *
+ * Masks are written in hexadecimal because binary literals are not standard C before C23.
+ */
+utf8_byte_type get_utf8_byte_type(unsigned char input) {
+    if ((input & 0x80) == 0x00) {
+        return single_ASCII_byte;
+    }
+    if ((input & 0xC0) == 0x80) {
+        return trailing_byte;
+    }
+    if ((input & 0xE0) == 0xC0) {
+        return head_double_byte;
+    }
+    if ((input & 0xF0) == 0xE0) {
+        return head_triple_byte;
+    }
+    if ((input & 0xF8) == 0xF0) {
+        return head_quad_byte;
+    }
+    return invalid_utf8_byte;
+}
+
+/*
+ * Stores `byte_buffer` (of `buf_length` bytes) in the slot at index *utf8_str_length of the array *utf8_str and
+ * then increments *utf8_str_length. The array is not grown here, so the caller must have room for the new slot.
+ * The slot keeps the pointer itself; the bytes are not copied.
+ */
+void push_byte_buffer_to_utf8_str(utf8_char **utf8_str, size_t *utf8_str_length, char *byte_buffer, int buf_length) {
+    utf8_char *slot = &(*utf8_str)[*utf8_str_length];
+
+    slot->bytes = byte_buffer;
+    slot->length = buf_length;
+
+    (*utf8_str_length)++;
+}
+
+/*
+ * Frees the byte buffer of each of the first `length` elements of `str`, then frees the array itself.
+ */
+void free_utf8_str(utf8_char *str, int length) {
+    utf8_char *cursor = str;
+    int remaining = length;
+
+    while (remaining > 0) {
+        free(cursor->bytes);
+        cursor++;
+        remaining--;
+    }
+    free(str);
+}
+
+/*
+ * Splits the NUL-terminated byte string `input` into UTF-8 characters.
+ *
+ * On success returns 0, stores a malloc'd array in *output_string and its element count in *output_length;
+ * release the array with free_utf8_str(). For an empty input the count is 0.
+ *
+ * On failure returns 1, frees everything allocated so far and leaves both outputs untouched. Failure means a stray
+ * continuation byte, an invalid lead byte, a multi-byte sequence that is cut short by the end of the string or
+ * contains a non-continuation byte, or an allocation failure.
+ *
+ * Only the byte structure is checked; overlong encodings and surrogate code points are not rejected.
+ */
+int parse_to_utf8(char *input, utf8_char **output_string, size_t *output_length) {
+    const size_t input_str_length = strlen(input);
+    // One slot per input byte is always enough; the array is trimmed once the real count is known.
+    utf8_char *output_str = malloc(input_str_length * sizeof *output_str);
+    size_t utf8_str_len = 0;
+    if (output_str == NULL && input_str_length > 0) {
+        return 1;
+    }
+
+    size_t i = 0;
+    while (i < input_str_length) {
+        const utf8_byte_type byte_type = get_utf8_byte_type((unsigned char)input[i]);
+
+        size_t char_size = 0;
+        switch (byte_type) {
+        case single_ASCII_byte:
+            char_size = 1;
+            break;
+        case head_double_byte:
+        case head_triple_byte:
+        case head_quad_byte:
+            // The enum value of a head byte equals the total length of its sequence (2, 3 or 4).
+            char_size = (size_t)byte_type;
+            break;
+        case trailing_byte: // a continuation byte without a head byte is invalid
+        case invalid_utf8_byte:
+        default:
+            goto fail;
+        }
+
+        // The whole sequence must lie inside the string; a truncated character is an error.
+        if (char_size > input_str_length - i) {
+            goto fail;
+        }
+        for (size_t j = 1; j < char_size; j++) {
+            if (get_utf8_byte_type((unsigned char)input[i + j]) != trailing_byte) {
+                goto fail;
+            }
+        }
+
+        // Validate first, allocate afterwards, so the error paths above have nothing extra to free.
+        char *byte_buffer = malloc(char_size);
+        if (byte_buffer == NULL) {
+            goto fail;
+        }
+        memcpy(byte_buffer, input + i, char_size);
+        push_byte_buffer_to_utf8_str(&output_str, &utf8_str_len, byte_buffer, (int)char_size);
+        i += char_size;
+    }
+
+    if (utf8_str_len != input_str_length) {
+        // Shrink the array to the number of characters found. realloc takes a size in bytes, not elements.
+        // If shrinking fails the original, larger block is still valid and is kept.
+        utf8_char *shrunk = realloc(output_str, utf8_str_len * sizeof *output_str);
+        if (shrunk != NULL) {
+            output_str = shrunk;
+        }
+    }
+    *output_length = utf8_str_len;
+    *output_string = output_str;
+    return 0;
+
+fail:
+    free_utf8_str(output_str, (int)utf8_str_len);
+    return 1;
+}
+
+/*
+ * Returns true when `a` and `b` hold the same number of bytes and every byte matches, false otherwise.
+ */
+bool utf8_char_eq(utf8_char *a, utf8_char *b) {
+    const size_t byte_count = a->length;
+    if (byte_count != b->length) {
+        return false;
+    }
+
+    const char *lhs = a->bytes;
+    const char *rhs = b->bytes;
+    for (size_t remaining = byte_count; remaining > 0; remaining--) {
+        if (*lhs != *rhs) {
+            return false;
+        }
+        lhs++;
+        rhs++;
+    }
+    return true;
+}
+
+/*
+ * Writes the raw bytes of the first `length` characters of `str` to stdout, in order.
+ * No separator and no trailing newline are added.
+ */
+void print_utf8_str(utf8_char *str, size_t length) {
+    for (size_t i = 0; i < length; i++) {
+        // One fwrite per character instead of one putc per byte; skip empty characters entirely.
+        if (str[i].length > 0) {
+            fwrite(str[i].bytes, sizeof(char), str[i].length, stdout);
+        }
+    }
+}
+
+/*
+ * Returns a deep copy of `from`: the result owns a freshly malloc'd buffer holding the same bytes.
+ * The caller releases it with free() on `.bytes` (free_utf8_str() does this for whole arrays).
+ *
+ * If `from` is empty or the allocation fails, the result has `.bytes == NULL` and `.length == 0`, which is safe
+ * to print, compare and free.
+ */
+utf8_char copy_utf8_char(utf8_char *from) {
+    utf8_char to_char = {.bytes = NULL, .length = 0};
+
+    if (from->length == 0) {
+        return to_char;
+    }
+
+    to_char.bytes = malloc(from->length);
+    if (to_char.bytes == NULL) {
+        // Report failure through the empty result instead of writing through a null pointer.
+        return to_char;
+    }
+    memcpy(to_char.bytes, from->bytes, from->length);
+    to_char.length = from->length;
+    return to_char;
+}
--- a/utf8hack.h
+++ b/utf8hack.h
@ -0,0 +1,19 @@
+#ifndef UTF8_HACK
+#define UTF8_HACK
+// Hacky implementation of UTF-8 parsing. Doesn't do much validation.
+// Its primary purpose is to split a byte string into individual Unicode code points.
+#include <stdbool.h>
+#include <stddef.h>
+
+typedef struct utf8_char { // one UTF-8 encoded character
+    char *bytes;   // encoded bytes of the character; not NUL-terminated
+    size_t length; // number of bytes stored in `bytes`
+} utf8_char;
+
+bool utf8_char_eq(utf8_char *a, utf8_char *b); // true when lengths and all bytes match
+int parse_to_utf8(char *input, utf8_char **output_string, size_t *output_length); // 0 on success, 1 on bad input
+void print_utf8_str(utf8_char *str, size_t length); // writes the bytes of each character to stdout
+void free_utf8_str(utf8_char *str, int length); // frees each element's bytes, then the array
+utf8_char copy_utf8_char(utf8_char *from); // copy with its own malloc'd byte buffer
+
+#endif // UTF8_HACK