Initial Commit

This commit is contained in:
Alexander Bass 2023-09-03 20:48:08 -04:00
commit 41d574f45f
7 changed files with 350 additions and 0 deletions

2
.clang-format Executable file
View file

@ -0,0 +1,2 @@
ColumnLimit: 120
IndentWidth: 4

1
.gitignore vendored Normal file
View file

@ -0,0 +1 @@
acaesar

20
README.md Normal file
View file

@ -0,0 +1,20 @@
# ACaesar - Caesar cipher transcoder
This program encodes and decodes messages using a [Caesar Cipher](https://en.wikipedia.org/wiki/Caesar_cipher).
```
Usage: acaesar [options] message
Options:
--help Print this help message
-o <offset> Set the integer offset used in transcoding
-c <character set> Set the character set
--xyz Set the character set to the lowercase alphabet (default)
```
## Compiling
The program is small and should compile with any modern C compiler; it was developed using GCC on Ubuntu. A makefile is included, containing a single `build` target that runs:
```
gcc acaesar.c utf8hack.c -O3 -Wall -Wextra -o acaesar
```

160
acaesar.c Executable file
View file

@ -0,0 +1,160 @@
#include "utf8hack.h"
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// Returns the index of the first element of `utf8_string` (which holds `string_length` elements) that is equal
// to `search_char`, or -1 when no element matches.
int get_index_of_utf8_char(utf8_char *utf8_string, size_t string_length, utf8_char search_char);
// Replaces every character of `message_str` that occurs in `charset_str` with the character `offset` positions
// further along the character set, copies all other characters unchanged, and prints the result followed by a
// newline.
void transcode(long offset, char *charset_str, char *message_str);
// Usage text printed by the `--help` flag.
#define HELP_MESSAGE \
"ACaesar - Caesar Cipher transcoder version 0.1\n\
Usage: acaesar [options] message\n\
Options:\n\
--help Print this help message\n\
-o <offset> Set the integer offset used in transcoding\n\
-c <character set> Set the character set\n\
--xyz Set the character set to the lowercase alphabet (default)\n"
// Program entry point: parses the command line, then transcodes the message.
//
// Recognised arguments:
//   --help            print the usage text and exit successfully
//   -o <offset>       integer shift applied to characters found in the character set (required)
//   -c <charset>      character set to shift within
//   --xyz             use the lowercase alphabet as character set (this is also the default)
//   <message>         the last argument, when it is not consumed by a flag, is the message
//
// Returns 0 on success and 1 on any usage error.
//
// The character set and message point directly at the argv strings (or at a string literal for the default
// character set). Those strings stay valid until the program terminates, so nothing is copied and nothing has
// to be freed here.
int main(int argc, char *argv[]) {
    bool charset_set = false;
    bool message_set = false;
    bool offset_set = false;
    long offset = 0;
    char *charset = NULL;
    char *message = NULL;

    // Argument parsing
    for (int i = 1; i < argc; i++) {
        char *current_arg = argv[i];

        // Help flag
        if (strcmp(current_arg, "--help") == 0) {
            // fputs instead of printf: the text is not meant to be interpreted as a format string.
            fputs(HELP_MESSAGE, stdout);
            return 0;
        }

        // Charset flag
        if (strcmp(current_arg, "-c") == 0) {
            if (i == argc - 1) {
                fprintf(stderr, "Error, charset (-c) flag found without following charset.\n");
                return 1;
            }
            if (charset_set) {
                fprintf(stderr, "Error, character set specified twice.\n");
                return 1;
            }
            charset = argv[i + 1];
            charset_set = true;
            i++; // Skip the value that was just consumed
            continue;
        }

        // Offset flag
        if (strcmp(current_arg, "-o") == 0) {
            if (i == argc - 1) {
                fprintf(stderr, "Error, offset (-o) flag found without following offset.\n");
                return 1;
            }
            if (offset_set) {
                fprintf(stderr, "Error, offset (-o) specified twice.\n");
                return 1;
            }
            char *next_arg = argv[i + 1];
            char *end = NULL;
            offset = strtol(next_arg, &end, 10);
            // Reject empty values and trailing garbage instead of silently treating them as 0.
            if (end == next_arg || *end != '\0') {
                fprintf(stderr, "Error, offset (-o) must be an integer, found: '%s'\n", next_arg);
                return 1;
            }
            offset_set = true;
            i++; // Skip the value that was just consumed
            continue;
        }

        // Alphabetical charset flag. (default anyways). Only exists for completeness.
        if (strcmp(current_arg, "--xyz") == 0) {
            if (charset_set) {
                fprintf(stderr, "Error, character set specified twice.\n");
                return 1;
            }
            charset = "abcdefghijklmnopqrstuvwxyz";
            charset_set = true;
            continue;
        }

        if (i != argc - 1) {
            // Unrecognized flag
            fprintf(stderr, "Error, unknown argument found: '%s'\n", current_arg);
            return 1;
        }

        // Message to be transcoded.
        // Last argument is assumed to be message.
        message = current_arg;
        message_set = true;
    }

    if (!charset_set) {
        // Default to a...z when no charset is found.
        charset = "abcdefghijklmnopqrstuvwxyz";
    }
    if (!offset_set) {
        fprintf(stderr, "Error, no offset found. Use '-o' to set one.\n");
        return 1;
    }
    if (!message_set) {
        fprintf(stderr, "Error, no message found to transcode.\n");
        return 1;
    }

    transcode(offset, charset, message);
    return 0;
}
// Linear search for `search_char` in the first `string_length` elements of `utf8_string`.
// Returns the position of the first match, or -1 when the character does not occur.
int get_index_of_utf8_char(utf8_char *utf8_string, size_t string_length, utf8_char search_char) {
    size_t position = 0;
    while (position < string_length) {
        if (utf8_char_eq(utf8_string + position, &search_char)) {
            return (int)position;
        }
        position++;
    }
    return -1;
}
// Applies the Caesar shift to `message_str` and prints the result followed by a newline.
//
// offset       number of positions to move within the character set; may be negative (e.g. for decoding)
// charset_str  UTF-8 encoded character set; the shift wraps around its end
// message_str  UTF-8 encoded message; characters that are not in the character set are copied unchanged
//
// The function returns nothing, so failures (invalid UTF-8 in either string, or running out of memory) are
// reported on stderr and nothing is printed to stdout in that case.
void transcode(long offset, char *charset_str, char *message_str) {
    size_t charset_length;
    utf8_char *charset;
    if (parse_to_utf8(charset_str, &charset, &charset_length) != 0) {
        fprintf(stderr, "Error, character set is not valid UTF-8.\n");
        return;
    }

    size_t message_length;
    utf8_char *message;
    if (parse_to_utf8(message_str, &message, &message_length) != 0) {
        fprintf(stderr, "Error, message is not valid UTF-8.\n");
        free_utf8_str(charset, charset_length);
        return;
    }

    // Allocate at least one element so an empty message does not rely on what malloc(0) returns.
    utf8_char *transcoded_message = malloc((message_length > 0 ? message_length : 1) * sizeof *transcoded_message);
    if (transcoded_message == NULL) {
        fprintf(stderr, "Error, out of memory.\n");
        free_utf8_str(charset, charset_length);
        free_utf8_str(message, message_length);
        return;
    }

    // Reduce the offset to the range [0, charset_length) once, up front. Doing the modulo on the signed value
    // and then fixing up the sign keeps negative offsets correct; mixing a negative long with size_t in a
    // single expression would convert it to a huge unsigned number first and select the wrong character.
    size_t shift = 0;
    if (charset_length > 0) {
        long reduced = offset % (long)charset_length;
        if (reduced < 0) {
            reduced += (long)charset_length;
        }
        shift = (size_t)reduced;
    }

    for (size_t i = 0; i < message_length; i++) {
        // A hashmap would likely be more efficient for larger messages, but would introduce much complexity.
        int a = get_index_of_utf8_char(charset, charset_length, message[i]);
        if (a != -1) {
            // A match implies charset_length > 0, so the modulo below is well defined.
            transcoded_message[i] = copy_utf8_char(&charset[((size_t)a + shift) % charset_length]);
        } else {
            // If character is not in character set, copy it over unchanged.
            transcoded_message[i] = copy_utf8_char(&message[i]);
        }
    }

    print_utf8_str(transcoded_message, message_length);
    printf("\n");

    free_utf8_str(charset, charset_length);
    free_utf8_str(message, message_length);
    free_utf8_str(transcoded_message, message_length);
}

2
makefile Normal file
View file

@ -0,0 +1,2 @@
build:
gcc acaesar.c utf8hack.c -O3 -Wall -Wextra -o acaesar

146
utf8hack.c Normal file
View file

@ -0,0 +1,146 @@
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// A single UTF-8 encoded character.
// NOTE: utf8hack.h declares the same type for users of this module; the two definitions must stay identical.
struct utf8_char {
    char *bytes;   // Heap buffer holding the encoded bytes; it is not NUL-terminated
    size_t length; // Number of bytes stored in `bytes`
};
typedef struct utf8_char utf8_char;
// Classification of one byte of a UTF-8 stream.
// The numeric values are spelled out because parse_to_utf8 uses the value of a head byte type directly as the
// total number of bytes in the sequence (2, 3 or 4). Do not renumber.
typedef enum {
    single_ASCII_byte = 0, // 0xxxxxxx: a complete one-byte character
    trailing_byte = 1,     // 10xxxxxx: continuation byte of a multi-byte character
    head_double_byte = 2,  // 110xxxxx: first byte of a two-byte character
    head_triple_byte = 3,  // 1110xxxx: first byte of a three-byte character
    head_quad_byte = 4,    // 11110xxx: first byte of a four-byte character
    invalid_utf8_byte = 5, // 11111xxx: never valid in UTF-8
} utf8_byte_type;
// Classifies a single byte by its leading bits.
// Only the bit pattern of this one byte is inspected; no other validation takes place.
utf8_byte_type get_utf8_byte_type(unsigned char input) {
    // High bit clear: plain ASCII
    if (input < 0x80) {
        return single_ASCII_byte;
    }
    // 10xxxxxx
    if ((input >> 6) == 0x02) {
        return trailing_byte;
    }
    // 110xxxxx
    if ((input >> 5) == 0x06) {
        return head_double_byte;
    }
    // 1110xxxx
    if ((input >> 4) == 0x0E) {
        return head_triple_byte;
    }
    // 11110xxx
    if ((input >> 3) == 0x1E) {
        return head_quad_byte;
    }
    // 11111xxx
    return invalid_utf8_byte;
}
// Stores `byte_buffer` (holding `buf_length` bytes) as the next character of `*utf8_str` and increments
// `*utf8_str_length`. The buffer pointer itself is stored, not a copy of its contents.
// No bounds check is done: the caller must have room for one more element in `*utf8_str`.
void push_byte_buffer_to_utf8_str(utf8_char **utf8_str, size_t *utf8_str_length, char *byte_buffer, int buf_length) {
    utf8_char *slot = *utf8_str + *utf8_str_length;
    slot->bytes = byte_buffer;
    slot->length = buf_length;
    *utf8_str_length += 1;
}
// Releases the byte buffer of each of the first `length` characters of `str`, then the array itself.
void free_utf8_str(utf8_char *str, int length) {
    utf8_char *cursor = str;
    for (int remaining = length; remaining > 0; remaining--) {
        free(cursor->bytes);
        cursor++;
    }
    free(str);
}
// Splits the NUL-terminated byte string `input` into an array of UTF-8 characters.
//
// input          bytes to split; it is only read
// output_string  on success, receives a newly allocated array that the caller releases with free_utf8_str
// output_length  on success, receives the number of characters in that array
//
// Returns 0 on success. Returns 1 when the input contains a stray trailing byte, an invalid byte, a
// multi-byte sequence that is cut short or not followed by enough trailing bytes, or when memory runs out.
// On failure everything allocated here is released and the two output parameters are left untouched.
int parse_to_utf8(char *input, utf8_char **output_string, size_t *output_length) {
    const size_t input_str_length = strlen(input);
    // One character per input byte is the upper bound. Ask for at least one element so that an empty input
    // does not depend on what malloc(0) returns.
    const size_t capacity = input_str_length > 0 ? input_str_length : 1;
    utf8_char *output_str = malloc(capacity * sizeof(utf8_char));
    if (output_str == NULL) {
        return 1;
    }
    size_t utf8_str_len = 0;

    size_t i = 0;
    while (i < input_str_length) {
        const utf8_byte_type byte_type = get_utf8_byte_type((unsigned char)input[i]);
        size_t char_length;
        switch (byte_type) {
        case single_ASCII_byte:
            char_length = 1;
            break;
        case head_double_byte:
        case head_triple_byte:
        case head_quad_byte:
            // The enumerator value of a head byte type equals the total length of the sequence in bytes.
            char_length = (size_t)byte_type;
            break;
        case trailing_byte: // Trailing byte without a head byte is invalid
        case invalid_utf8_byte:
        default:
            free_utf8_str(output_str, utf8_str_len);
            return 1;
        }

        // Every byte after the head must be a trailing byte. The NUL terminator is not one, so a sequence cut
        // short by the end of the input is rejected here and the scan never reads past the terminator.
        for (size_t j = 1; j < char_length; j++) {
            if (get_utf8_byte_type((unsigned char)input[i + j]) != trailing_byte) {
                free_utf8_str(output_str, utf8_str_len);
                return 1;
            }
        }

        // Allocate only after validation so that no error path has a half-filled buffer to clean up.
        char *byte_buffer = malloc(char_length);
        if (byte_buffer == NULL) {
            free_utf8_str(output_str, utf8_str_len);
            return 1;
        }
        memcpy(byte_buffer, input + i, char_length);
        push_byte_buffer_to_utf8_str(&output_str, &utf8_str_len, byte_buffer, (int)char_length);
        i += char_length;
    }

    if (utf8_str_len > 0 && utf8_str_len < input_str_length) {
        // Shrink output string to proper size. The size is in bytes, hence the multiplication. A failed
        // shrink is harmless: the original, larger block is still valid and is kept.
        utf8_char *shrunk = realloc(output_str, utf8_str_len * sizeof(utf8_char));
        if (shrunk != NULL) {
            output_str = shrunk;
        }
    }
    *output_length = utf8_str_len;
    *output_string = output_str;
    return 0;
}
// Two characters are equal when they have the same length and identical bytes.
bool utf8_char_eq(utf8_char *a, utf8_char *b) {
    const size_t byte_count = a->length;
    if (byte_count != b->length) {
        return false;
    }
    // Zero-length characters are equal without looking at the byte buffers at all.
    return byte_count == 0 || memcmp(a->bytes, b->bytes, byte_count) == 0;
}
// Writes the bytes of the first `length` characters of `str` to stdout, in order.
// Nothing is appended: no separator and no trailing newline.
void print_utf8_str(utf8_char *str, size_t length) {
    for (size_t i = 0; i < length; i++) {
        // Skip empty characters so fwrite is never handed a buffer it has nothing to read from.
        if (str[i].length > 0) {
            fwrite(str[i].bytes, sizeof(char), str[i].length, stdout);
        }
    }
}
// Returns a deep copy of `*from`: the result owns a freshly allocated byte buffer with the same contents.
// The caller is responsible for freeing the returned `bytes` (free_utf8_str does this for whole arrays).
// When `from` is empty, or the allocation fails, the result is an empty character (bytes == NULL, length == 0),
// which the other functions of this module treat as a character with no bytes.
utf8_char copy_utf8_char(utf8_char *from) {
    utf8_char to_char = {.bytes = NULL, .length = 0};
    if (from->length == 0) {
        return to_char;
    }
    to_char.bytes = malloc(from->length);
    if (to_char.bytes == NULL) {
        // Out of memory: hand back the empty character instead of writing through a NULL pointer.
        return to_char;
    }
    memcpy(to_char.bytes, from->bytes, from->length);
    to_char.length = from->length;
    return to_char;
}

19
utf8hack.h Normal file
View file

@ -0,0 +1,19 @@
#ifndef UTF8_HACK
#define UTF8_HACK
// hacky implementation of UTF-8 parsing. Doesn't do much validation.
// Primary purpose is to separate bytes into unicode codepoints.
#include <stdbool.h>
#include <stddef.h>
// A single UTF-8 encoded character.
// NOTE: utf8hack.c carries its own copy of this definition; the two must stay identical.
typedef struct utf8_char {
    char *bytes;   // Heap buffer holding the encoded bytes; it is not NUL-terminated
    size_t length; // Number of bytes stored in `bytes`
} utf8_char;
// Returns true when `a` and `b` have the same length and the same bytes.
bool utf8_char_eq(utf8_char *a, utf8_char *b);
// Splits the NUL-terminated string `input` into a newly allocated array of characters.
// Returns 0 on success, storing the array in `*output_string` and its element count in `*output_length`;
// release it with free_utf8_str. Returns 1 when the input is rejected, in which case the outputs are not written.
int parse_to_utf8(char *input, utf8_char **output_string, size_t *output_length);
// Writes the bytes of the first `length` characters of `str` to stdout, without a trailing newline.
void print_utf8_str(utf8_char *str, size_t length);
// Frees the byte buffer of each of the first `length` characters of `str`, then frees `str` itself.
void free_utf8_str(utf8_char *str, int length);
// Returns a copy of `*from` that owns a newly allocated byte buffer.
utf8_char copy_utf8_char(utf8_char *from);
#endif // UTF8_HACK