From 6de1915dbefbeb8aefd7db8c76a961bc5d8228b9 Mon Sep 17 00:00:00 2001 From: sBubshait Date: Sat, 15 Jun 2024 01:50:56 +0100 Subject: [PATCH] Restructure overall assembler. Add string_util and Docs --- src/Makefile | 2 +- src/assemble.c | 12 +- src/encode.c | 21 ++- src/parser.c | 447 ++++++++++++++++++++-------------------------- src/parser.h | 17 +- src/string_util.c | 173 ++++++++++++++++++ src/string_util.h | 64 +++++++ src/symboltable.c | 8 + src/symboltable.h | 49 ++++- src/tokeniser.c | 101 ++++------- src/tokeniser.h | 26 +++ 11 files changed, 587 insertions(+), 333 deletions(-) create mode 100644 src/string_util.c create mode 100644 src/string_util.h create mode 100644 src/tokeniser.h diff --git a/src/Makefile b/src/Makefile index 071143f..150b23c 100755 --- a/src/Makefile +++ b/src/Makefile @@ -9,7 +9,7 @@ CFLAGS ?= -std=c17 -g\ all: assemble -assemble: assemble.o parser.o fileio.o +assemble: assemble.o parser.o fileio.o tokeniser.o string_util.o emulate: emulate.o clean: diff --git a/src/assemble.c b/src/assemble.c index 59a22d2..9b2484c 100644 --- a/src/assemble.c +++ b/src/assemble.c @@ -1,3 +1,9 @@ +/** @file assemble.c + * @brief The main file for the ARMv8 assembler. Reads an assembly file and outputs the binary file. + * + * @author Saleh Bubshait + */ + #include #include #include "a64instruction/a64instruction.h" @@ -31,11 +37,13 @@ int main(int argc, char **argv) { // Write the binary to the output file writeBinaryFile(binary, argv[2], lineCount); - /* TODO: FREE MEMORY!! */ - return EXIT_SUCCESS; } +/** The first pass of the assembler. Creates the symbol table. Adds all labels + * and the address of the instruction following the label to the symbol table. + * Returns the final symbol table. 
+ */ static symbol_table *firstPass(a64inst_instruction *instructions, int lineCount) { symbol_table *table = st_init(); int labelCount = 0; diff --git a/src/encode.c b/src/encode.c index 46b84c2..bdb89f4 100644 --- a/src/encode.c +++ b/src/encode.c @@ -1,3 +1,12 @@ +/** @file encode.c + * @brief A function to encode the internal representation of ARMv8 + * instructions, a64inst_instruction, into binary. + * + * @author Ethan Dias Alberto + * @author George Niedringhaus + * @author Saleh Bubshait + */ + #include #include "global.h" #include "a64instruction/a64instruction.h" @@ -53,7 +62,7 @@ static int getLabelOffset(symbol_table* table, char* label, int currentIndex, in } // Generates assembled code based on the two-pass assembly method -word encodeBranch(a64inst_instruction *instr, int index, symbol_table *st) { +static word encodeBranch(a64inst_instruction *instr, int index, symbol_table *st) { word wrd = 0; switch (instr->data.BranchData.BranchType) { @@ -77,7 +86,7 @@ word encodeBranch(a64inst_instruction *instr, int index, symbol_table *st) { return wrd; } -word encodeDPImmediate(a64inst_instruction inst) { +static word encodeDPImmediate(a64inst_instruction inst) { word wrd = 0; a64inst_DPImmediateData data = inst.data.DPImmediateData; @@ -104,7 +113,7 @@ word encodeDPImmediate(a64inst_instruction inst) { return wrd; } -word encodeDPRegister(a64inst_instruction inst) { +static word encodeDPRegister(a64inst_instruction inst) { word wrd = 0; a64inst_DPRegisterData data = inst.data.DPRegisterData; @@ -139,7 +148,7 @@ word encodeDPRegister(a64inst_instruction inst) { } -word encodeSingleDataTransfer(a64inst_instruction inst) { +static word encodeSingleDataTransfer(a64inst_instruction inst) { word wrd = 0; a64inst_SingleTransferData data = inst.data.SingleTransferData; @@ -175,7 +184,7 @@ word encodeSingleDataTransfer(a64inst_instruction inst) { return wrd; } -word encodeLoadLiteral(a64inst_instruction cI, int arrIndex, symbol_table *st) { +static word 
encodeLoadLiteral(a64inst_instruction cI, int arrIndex, symbol_table *st) { word wrd = 0; a64inst_SingleTransferData data = cI.data.SingleTransferData; @@ -189,7 +198,7 @@ word encodeLoadLiteral(a64inst_instruction cI, int arrIndex, symbol_table *st) { return wrd; } -word *encode(a64inst_instruction insts[], int instCount, symbol_table* st) { +static word *encode(a64inst_instruction insts[], int instCount, symbol_table* st) { word *arr = (word*)malloc(sizeof(word) * instCount); int index = 0; for (int i = 0; i < instCount; i++) { diff --git a/src/parser.c b/src/parser.c index 2985b4c..474652e 100644 --- a/src/parser.c +++ b/src/parser.c @@ -1,24 +1,50 @@ +/** @file parser.c + * @brief Functions to parse ARMv8 assembly lines into an array of a special + * internal representation of instructions, a64inst_instruction. + * @author Ethan Dias Alberto + * @author George Niedringhaus + * @author Saleh Bubshait + */ + #include #include #include #include +#include #include #include "parser.h" #include "a64instruction/a64instruction.h" #include "global.h" -#include "tokeniser.c" +#include "tokeniser.h" +#include "string_util.h" -/** Prototypes */ -void parse_instruction(char asmLine[], a64inst_instruction *instr); -static char *duplicateString(char *str); -void parseSingleTransfer(a64inst_instruction *instr, char *opcode, char *operandList[], int numOperands); -void parseBranch(a64inst_instruction *instr, char* opcode, char *operandList[]); -void calculateAddressFormat(a64inst_instruction *instr, char *operandList[], int numOperands); -void parseDPImmediate(a64inst_instruction *inst, char *tokens[], int tokensCount); -void parseDPRegister(a64inst_instruction *inst, char *tokens[], int tokensCount); -void parseDirective(a64inst_instruction *inst, char *tokens[]); +/************************************ + * STRUCTS + ************************************/ + +typedef struct { + int type; + int immediate; +} ShiftData; + +/************************************ + * PROTOTYPES + 
************************************/ + +static void parse_instruction(char asmLine[], a64inst_instruction *instr); +static void parseSingleTransfer(a64inst_instruction *instr, char *opcode, char *operandList[], int numOperands); +static void parseBranch(a64inst_instruction *instr, char* opcode, char *operandList[]); +static void parseAddressingMode(a64inst_instruction *instr, char *operandList[], int numOperands); +static void parseDPImmediate(a64inst_instruction *inst, char *tokens[], int tokensCount); +static void parseDPRegister(a64inst_instruction *inst, char *tokens[], int tokensCount); +static void parseDirective(a64inst_instruction *inst, char *tokens[]); +static ShiftData *parseShift(char *shift); +static void classifyOpcode(char* opcode, a64inst_instruction *instr, char *tokens[], int *tokensCount); + +/************************************ + * CONSTANTS + ************************************/ -/** Constants */ static const char *BRANCH_OPCODES[] = {"b", "br", "b.eq", "b.ne", "b.ge", "b.lt", "b.gt", "b.le", "b.al"}; static const char *SINGLE_TRANSFER_OPCODES[] = {"ldr", "str"}; static const char *WIDE_MOV_OPCODES[] = {"movn", "movz", "movz", "movk"}; @@ -26,9 +52,11 @@ static const char *ARITHMETIC_OPCODES[] = {"add", "adds", "sub", "subs"}; static const char *MULTIPLY_OPCODES[] = {"mul", "madd", "msub", "mneg"}; static const char *SHIFT_TYPE_OPCODES[] = {"lsl", "lsr", "asr", "ror"}; static const char *LOGIC_OPCODES[] = {"and", "ands", "bic", "bics", "eor", "eon", "orr", "orn"}; -static const char *ZERO_REGISTER_ALIAS[] = {"xzr", "wzr"}; -static const char *ALIAS_OPCODES[] = {"cmp", "cmn", "neg", "negs", "tst", "mvn", "mov"}; -static char *ALIAS_TARGET_OPCODES[] = {"subs", "adds", "sub", "subs", "ands", "orn", "orr"}; + + +/************************************ + * FUNCTIONS + ************************************/ a64inst_instruction *parse(char **asmLines, int lineCount) { a64inst_instruction *instructions = malloc(sizeof(a64inst_instruction) * lineCount); 
@@ -38,180 +66,33 @@ a64inst_instruction *parse(char **asmLines, int lineCount) { parse_instruction(asmLines[i], &instructions[i]); i++; } - + return instructions; } -static char *duplicateString(char *str) { - char *newStr = malloc(strlen(str) + 1); - strcpy(newStr, str); - return newStr; -} - -static bool isStringIn(char *str, const char *arr[], int arrSize) { - for (int i = 0; i < arrSize; i++) { - if (strcmp(str, arr[i]) == 0) { - return true; - } - } - return false; -} - -// If more than one occurance, return the last index -static int indexStringIn(char *str, const char *arr[], int arrSize) { - for (int i = arrSize - 1; i >= 0; i--) { - if (strcmp(str, arr[i]) == 0) { - return i; - } - } - return -1; -} - -typedef struct { - int type; - int immediate; -} ShiftData; - -static ShiftData *parseShift(char *shift) { - char buffer[100]; - strcpy(buffer, shift); - char *shiftType = strtok(buffer, " "); - char *shiftAmount = strtok(NULL, " "); - ShiftData *data = malloc(sizeof(ShiftData)); - data->type = indexStringIn(shiftType, SHIFT_TYPE_OPCODES, 4); - while (*shiftAmount == ' ' || *shiftAmount == '#') { - shiftAmount++; - } - data->immediate = atoi(shiftAmount); - return data; -} - -int isOperandRegister(char regStartChar) { - return((regStartChar == 'x') || (regStartChar == 'w')); -} - -int classifyDPInst(char *operandList[]){ - return(isOperandRegister(operandList[1][0]) && - isOperandRegister(operandList[2][0]) && - isOperandRegister(operandList[3][0])); -} - -void classifyAlias(char *opcode, a64inst_instruction *instr, char *tokens[], int *tokensCount) { - - int aliasIndex = indexStringIn(opcode, ALIAS_OPCODES, 9); - if (aliasIndex != -1) { - // The instruction is one of the aliases, convert into the target. - char *opcode = ALIAS_TARGET_OPCODES[aliasIndex]; - - // To correctly encode the zero register, which is either w31 or x31. 
- char *start_zeroReg = tokens[1]; - while (isspace(*start_zeroReg)) start_zeroReg++; - char *zeroReg = malloc(5 * sizeof(char)); - *zeroReg = *start_zeroReg; - strcat(zeroReg, "31"); - - switch(aliasIndex) { - case 0: // cmp -> subs rzr, rn, - case 1: // cmn -> adds rzr, rn, - case 4: // tst -> ands rzr, rn, - // Convert from [instr] REG, to [instr] RZR, REG, - tokens[0] = opcode; - tokens[4] = tokens[3]; - tokens[3] = tokens[2]; - tokens[2] = tokens[1]; - tokens[1] = zeroReg; - (*tokensCount)++; - break; - - case 2: // neg -> subs rd, rzr, - case 3: // negs -> subs rd, rzr, - case 5: // mvn -> orn rd, rzr, - case 6: // mov -> orr rd, rzr, rm - tokens[0] = opcode; - tokens[4] = tokens[3]; - tokens[3] = tokens[2]; - tokens[2] = zeroReg; - (*tokensCount)++; - break; - - default: - break; - } - - } -} - -void classifyOpcode(char* opcode, a64inst_instruction *instr, char *tokens[], int *tokensCount){ - - classifyAlias(opcode, instr, tokens, tokensCount); - - if (isStringIn(opcode, BRANCH_OPCODES, 9)) { - instr->type = a64inst_BRANCH; - - if (strcmp(opcode, "br") == 0) { - instr->data.BranchData.BranchType = a64inst_REGISTER; - } else if (strcmp(opcode, "b") == 0) { - instr->data.BranchData.BranchType = a64inst_UNCONDITIONAL; - } else { - instr->data.BranchData.BranchType = a64inst_CONDITIONAL; - } - - } else if (isStringIn(opcode, SINGLE_TRANSFER_OPCODES, 2)) { - instr->type = a64inst_SINGLETRANSFER; - if (*tokens[2] == '[') { - instr->data.SingleTransferData.SingleTransferOpType = a64inst_SINGLE_TRANSFER_SINGLE_DATA_TRANSFER; - instr->data.SingleTransferData.processOpData.singleDataTransferData.transferType = strcmp(opcode, "ldr") == 0; - - } else { - instr->type = a64inst_LOADLITERAL; - } - } else if (classifyDPInst(tokens)) { - instr->type = a64inst_DPREGISTER; - } else { - instr->type = a64inst_DPIMMEDIATE; - } - -} - -//takes inputted char array and returns the integer of the operand, skipping the first character -//e.g. 
for a passed "R32", it skips the 'R' and returns 32 -int getOperandNumber(char *operand){ - if (isStringIn(operand, ZERO_REGISTER_ALIAS, 2)) { - return ZERO_REGISTER; - } - - char operandCpy[strlen(operand)]; - strcpy(operandCpy, operand+1); - char **endptr = NULL; - int number; - if(strncmp(operandCpy, "0x", 2)==0){ - //hex value - strcpy(operandCpy, operand+3); - number = strtol(operandCpy, endptr, 16); - } else if(operandCpy[0] == 'x'){ - number = strtol(operandCpy+1, endptr, 16); - } else { - number = strtol(operandCpy, endptr, 10); - } - return number; -} - - -void parse_instruction(char asmLine[], a64inst_instruction *instr) { +/** Parses a single ARMv8 assembly line into an a64inst_instruction. + */ +static void parse_instruction(char asmLine[], a64inst_instruction *instr) { if (instr == NULL){ exit(EXIT_FAILURE); } - if(strcmp(asmLine, HALT_ASM_CMD) == 0){ - instr->type = a64inst_HALT; - return; - } - char *asmLineCopy = duplicateString(asmLine); int tokensCount = 0; char **tokens = tokenise(asmLineCopy, &tokensCount); char *opcode = tokens[0]; + // Check if the instruction is the halt instruction, "and x0, x0, x0". + if (tokensCount == 4 && strcmp(opcode, "and") == 0 + && getRegister(tokens[1]) == 0 + && getRegister(tokens[2]) == 0 + && getRegister(tokens[3]) == 0) { + + instr->type = a64inst_HALT; + return; + } + + if(strcmp(opcode, ".int") == 0){ // Directive instr->type = a64inst_DIRECTIVE; @@ -226,6 +107,8 @@ void parse_instruction(char asmLine[], a64inst_instruction *instr) { } else { // Instruction + + // Classify the opcode into the correct instruction type. 
classifyOpcode(opcode, instr, tokens, &tokensCount); switch(instr->type){ @@ -235,74 +118,32 @@ void parse_instruction(char asmLine[], a64inst_instruction *instr) { case a64inst_SINGLETRANSFER: parseSingleTransfer(instr, opcode, tokens, tokensCount); - calculateAddressFormat(instr, tokens, tokensCount); + parseAddressingMode(instr, tokens, tokensCount); break; + case a64inst_LOADLITERAL: parseSingleTransfer(instr, opcode, tokens, tokensCount); break; + case a64inst_DPREGISTER: //generate DP operands; parseDPRegister(instr, tokens, tokensCount); break; + case a64inst_DPIMMEDIATE: parseDPImmediate(instr, tokens, tokensCount); break; + default: - printf("Error: Invalid Instruction\n"); + printf("Error: Invalid Instruction, '%s'\n", opcode); break; + } } - - /* TODO: FREE MEMORY! */ - } - - - -void calculateAddressFormat(a64inst_instruction *instr, char *tokens[], int tokenCount) { - assert(*tokens[2] == '['); - - int operandCount = 0; - char unsplitString[strlen(tokens[2])]; - strcpy(unsplitString, tokens[2]); - char **operands = tokeniseOperands(tokens[2], &operandCount); - - int baseRegister = getOperandNumber(operands[0]); - - instr->data.SingleTransferData.processOpData.singleDataTransferData.base = baseRegister; - - if (tokenCount >= 4) { - instr->data.SingleTransferData.processOpData.singleDataTransferData.addressingMode = a64inst_POST_INDEXED; - instr->data.SingleTransferData.processOpData.singleDataTransferData.a64inst_addressingModeData.indexedOffset = getOperandNumber(tokens[3]); - - } else if(unsplitString[strlen(unsplitString)-1] == '!') { - instr->data.SingleTransferData.processOpData.singleDataTransferData.addressingMode = a64inst_PRE_INDEXED; - instr->data.SingleTransferData.processOpData.singleDataTransferData.a64inst_addressingModeData.indexedOffset = getOperandNumber(operands[1]); - - } else if (operandCount == 1 || (!isOperandRegister(*operands[1]))) { - instr->data.SingleTransferData.processOpData.singleDataTransferData.addressingMode = 
a64inst_UNSIGNED_OFFSET; - if(operandCount > 1){ - int offset = getOperandNumber(operands[1]); - instr->data.SingleTransferData.processOpData.singleDataTransferData.a64inst_addressingModeData.unsignedOffset = offset/8; - //NEED TO SCALE IMMEDIATE VALUE BASED ON REGISTER TYPE IN ASSEMBLER - } - } else { - if((isOperandRegister(*operands[0]) == 1) - && (isOperandRegister(*operands[1]) == 1)){ - //register - instr->data.SingleTransferData.processOpData.singleDataTransferData.addressingMode = a64inst_REGISTER_OFFSET; - instr->data.SingleTransferData.processOpData.singleDataTransferData.a64inst_addressingModeData.offsetReg = getOperandNumber(operands[1]); - } - } -} - -static int parseRegisterType(char *operand) { - return operand[0] == 'x'; -} - -void parseDirective(a64inst_instruction *instr, char *tokens[]) { +static void parseDirective(a64inst_instruction *instr, char *tokens[]) { char *intValue = tokens[1]; char *endptr; if(strncmp(intValue, "0x", 2) == 0) { @@ -314,27 +155,28 @@ void parseDirective(a64inst_instruction *instr, char *tokens[]) { } -void parseSingleTransfer(a64inst_instruction *instr, char *opcode, char *tokens[], int tokensCount) { +static void parseSingleTransfer(a64inst_instruction *instr, char *opcode, char *tokens[], int tokensCount) { switch(instr->type){ case a64inst_SINGLETRANSFER: - instr->data.SingleTransferData.regType = parseRegisterType(tokens[1]); - instr->data.SingleTransferData.target = getOperandNumber(tokens[1]); + instr->data.SingleTransferData.regType = getRegisterType(tokens[1]); + instr->data.SingleTransferData.target = getRegister(tokens[1]); break; case a64inst_LOADLITERAL: - instr->data.SingleTransferData.regType = parseRegisterType(tokens[1]); - instr->data.SingleTransferData.target = getOperandNumber(tokens[1]); + instr->data.SingleTransferData.regType = getRegisterType(tokens[1]); + instr->data.SingleTransferData.target = getRegister(tokens[1]); if(*tokens[2] =='#'){ //offset is immediate - int offset = 
getOperandNumber(tokens[1]); - instr->data.SingleTransferData.processOpData.loadLiteralData.offset = offset; + instr->data.SingleTransferData.processOpData.loadLiteralData.offset = getImmediate(tokens[2]);; } else { + //offset is label instr->data.SingleTransferData.processOpData.loadLiteralData.label = tokens[2]; - //offset is literal, use symbol table and calculate difference } + break; + default: break; @@ -350,7 +192,7 @@ void parseBranch(a64inst_instruction *instr, char* opcode, char *operandList[]) instr->data.BranchData.processOpData.unconditionalData.label = operandList[1]; break; case a64inst_REGISTER: - instr->data.BranchData.processOpData.registerData.src = getOperandNumber(operandList[1]); + instr->data.BranchData.processOpData.registerData.src = getRegister(operandList[1]); break; case a64inst_CONDITIONAL: { @@ -381,13 +223,13 @@ void parseBranch(a64inst_instruction *instr, char* opcode, char *operandList[]) void parseDPImmediate(a64inst_instruction *inst, char *tokens[], int tokensCount) { a64inst_DPImmediateData *data = &inst->data.DPImmediateData; - data->dest = getOperandNumber(tokens[1]); - data->regType = parseRegisterType(tokens[1]); + data->dest = getRegister(tokens[1]); + data->regType = getRegisterType(tokens[1]); - if (isStringIn(tokens[0], WIDE_MOV_OPCODES, 4)) { + if (containsString(tokens[0], WIDE_MOV_OPCODES, 4)) { data->DPIOpType = a64inst_DPI_WIDEMOV; - data->processOp = indexStringIn(tokens[0], WIDE_MOV_OPCODES, 4); - data->processOpData.wideMovData.immediate = getOperandNumber(tokens[2]); + data->processOp = lastIndexOfString(tokens[0], WIDE_MOV_OPCODES, 4); + data->processOpData.wideMovData.immediate = getImmediate(tokens[2]); if (tokensCount >= 4) { ShiftData shData = *parseShift(tokens[3]); data->processOpData.wideMovData.shiftScalar = shData.immediate; @@ -395,9 +237,9 @@ void parseDPImmediate(a64inst_instruction *inst, char *tokens[], int tokensCount } else { data->DPIOpType = a64inst_DPI_ARITHM; - data->processOp = 
indexStringIn(tokens[0], ARITHMETIC_OPCODES, 4); - data->processOpData.arithmData.src = getOperandNumber(tokens[2]); - data->processOpData.arithmData.immediate = getOperandNumber(tokens[3]); + data->processOp = lastIndexOfString(tokens[0], ARITHMETIC_OPCODES, 4); + data->processOpData.arithmData.src = getRegister(tokens[2]); + data->processOpData.arithmData.immediate = getImmediate(tokens[3]); if (tokensCount >= 5) { ShiftData shData = *parseShift(tokens[4]); @@ -411,16 +253,16 @@ void parseDPImmediate(a64inst_instruction *inst, char *tokens[], int tokensCount void parseDPRegister(a64inst_instruction *inst, char *tokens[], int tokensCount) { a64inst_DPRegisterData *data = &inst->data.DPRegisterData; - data->dest = getOperandNumber(tokens[1]); - data->regType = parseRegisterType(tokens[1]); - data->src1 = getOperandNumber(tokens[2]); - data->src2 = getOperandNumber(tokens[3]); + data->dest = getRegister(tokens[1]); + data->regType = getRegisterType(tokens[1]); + data->src1 = getRegister(tokens[2]); + data->src2 = getRegister(tokens[3]); - if (isStringIn(tokens[0], MULTIPLY_OPCODES, 4)) { + if (containsString(tokens[0], MULTIPLY_OPCODES, 4)) { // Multiply data->DPROpType = a64inst_DPR_MULTIPLY; if (tokensCount >= 5) { - data->processOpData.multiplydata.summand = getOperandNumber(tokens[4]); + data->processOpData.multiplydata.summand = getRegister(tokens[4]); data->processOpData.multiplydata.negProd = strcmp(tokens[0], "msub") == 0; } else { @@ -432,21 +274,21 @@ void parseDPRegister(a64inst_instruction *inst, char *tokens[], int tokensCount) // Arithmetic/Logic data->DPROpType = a64inst_DPR_ARITHMLOGIC; - if (isStringIn(tokens[0], ARITHMETIC_OPCODES, 4)) { + if (containsString(tokens[0], ARITHMETIC_OPCODES, 4)) { // Arithmetic - data->processOp = indexStringIn(tokens[0], ARITHMETIC_OPCODES, 4); + data->processOp = lastIndexOfString(tokens[0], ARITHMETIC_OPCODES, 4); data->processOpData.arithmLogicData.type = 1; if(tokensCount == 5) { //has a shift int numTokens = 0; 
char **shiftOperands = tokenise(tokens[4], &numTokens); - data->processOpData.arithmLogicData.shiftType = indexStringIn(shiftOperands[0], SHIFT_TYPE_OPCODES, 4); - data->processOpData.arithmLogicData.shiftAmount = getOperandNumber(shiftOperands[1]); + data->processOpData.arithmLogicData.shiftType = lastIndexOfString(shiftOperands[0], SHIFT_TYPE_OPCODES, 4); + data->processOpData.arithmLogicData.shiftAmount = getImmediate(shiftOperands[1]); } } else { // Logic - int opcodeCategory = indexStringIn(tokens[0], LOGIC_OPCODES, 8); + int opcodeCategory = lastIndexOfString(tokens[0], LOGIC_OPCODES, 8); switch(opcodeCategory/2){ case 0: //and @@ -489,9 +331,102 @@ void parseDPRegister(a64inst_instruction *inst, char *tokens[], int tokensCount) //has a shift int numTokens = 0; char **shiftOperands = tokenise(tokens[4], &numTokens); - data->processOpData.arithmLogicData.shiftType = indexStringIn(shiftOperands[0], SHIFT_TYPE_OPCODES, 4); - data->processOpData.arithmLogicData.shiftAmount = getOperandNumber(shiftOperands[1]); + data->processOpData.arithmLogicData.shiftType = lastIndexOfString(shiftOperands[0], SHIFT_TYPE_OPCODES, 4); + data->processOpData.arithmLogicData.shiftAmount = getImmediate(shiftOperands[1]); } } } } + +/** Classifies the given opcode into the correct instruction type. + * Modifies instr to reflect the classification. + */ +static void classifyOpcode(char* opcode, a64inst_instruction *instr, char *tokens[], int *tokensCount) { + + // First, if the opcode is an alias, convert it to the target instruction. 
+ translateAlias(opcode, tokens, tokensCount); + + if (containsString(opcode, BRANCH_OPCODES, 9)) { + instr->type = a64inst_BRANCH; + + if (strcmp(opcode, "br") == 0) { + instr->data.BranchData.BranchType = a64inst_REGISTER; + } else if (strcmp(opcode, "b") == 0) { + instr->data.BranchData.BranchType = a64inst_UNCONDITIONAL; + } else { + instr->data.BranchData.BranchType = a64inst_CONDITIONAL; + } + + } else if (containsString(opcode, SINGLE_TRANSFER_OPCODES, 2)) { + instr->type = a64inst_SINGLETRANSFER; + if (*tokens[2] == '[') { + instr->data.SingleTransferData.SingleTransferOpType = a64inst_SINGLE_TRANSFER_SINGLE_DATA_TRANSFER; + instr->data.SingleTransferData.processOpData.singleDataTransferData.transferType = strcmp(opcode, "ldr") == 0; + + } else { + instr->type = a64inst_LOADLITERAL; + } + + // DP Instruction. + // DP Register if the third operand is a register. + } else if (*tokensCount >= 4 && isRegister(tokens[3])) { + instr->type = a64inst_DPREGISTER; + } else { + instr->type = a64inst_DPIMMEDIATE; + } + +} + +/** Parses a shift string such as "lsl #12" into a heap-allocated ShiftData; the caller owns (and must free) the result. + */ +static ShiftData *parseShift(char *shift) { + char buffer[20]; + snprintf(buffer, sizeof buffer, "%s", shift); + + char *shiftType = strtok(buffer, " "); + char *shiftAmount = strtok(NULL, " "); + + ShiftData *data = malloc(sizeof(ShiftData)); + + data->type = (shiftType != NULL) ? lastIndexOfString(shiftType, SHIFT_TYPE_OPCODES, 4) : -1; + + // getImmediate() skips leading whitespace itself; a missing amount yields 0. + data->immediate = (shiftAmount != NULL) ? getImmediate(shiftAmount) : 0; + return data; +} + +/** Parses the addressing mode of a single transfer instruction. 
(Not load literal) + */ +static void parseAddressingMode(a64inst_instruction *instr, char *tokens[], int tokenCount) { + assert(*tokens[2] == '['); + + int operandCount = 0; + char *unsplitString = duplicateString(tokens[2]); + char **operands = tokeniseOperands(tokens[2], &operandCount); + + int baseRegister = getRegister(operands[0]); + + instr->data.SingleTransferData.processOpData.singleDataTransferData.base = baseRegister; + + if (tokenCount >= 4) { + instr->data.SingleTransferData.processOpData.singleDataTransferData.addressingMode = a64inst_POST_INDEXED; + instr->data.SingleTransferData.processOpData.singleDataTransferData.a64inst_addressingModeData.indexedOffset = getImmediate(tokens[3]); + + } else if(unsplitString[strlen(unsplitString)-1] == '!') { + instr->data.SingleTransferData.processOpData.singleDataTransferData.addressingMode = a64inst_PRE_INDEXED; + instr->data.SingleTransferData.processOpData.singleDataTransferData.a64inst_addressingModeData.indexedOffset = getImmediate(operands[1]); + + } else if (operandCount == 1 || (!isRegister(operands[1]))) { + instr->data.SingleTransferData.processOpData.singleDataTransferData.addressingMode = a64inst_UNSIGNED_OFFSET; + if(operandCount > 1){ + int offset = getImmediate(operands[1]); + instr->data.SingleTransferData.processOpData.singleDataTransferData.a64inst_addressingModeData.unsignedOffset = offset/8; + } + } else { + if((isRegister(operands[0]) == 1) + && (isRegister(operands[1]) == 1)){ + instr->data.SingleTransferData.processOpData.singleDataTransferData.addressingMode = a64inst_REGISTER_OFFSET; + instr->data.SingleTransferData.processOpData.singleDataTransferData.a64inst_addressingModeData.offsetReg = getRegister(operands[1]); + } + } +} diff --git a/src/parser.h b/src/parser.h index 81885af..23b76c6 100644 --- a/src/parser.h +++ b/src/parser.h @@ -1,6 +1,17 @@ +/** @file parser.h + * @brief A function to parse ARMv8 assembly lines into an array of a special + * internal representation of 
instructions, a64inst_instruction. + * + * @author Ethan Dias Alberto + * @author Saleh Bubshait + */ + #include "a64instruction/a64instruction.h" -#define OPERAND_DELIMITER ", " -#define HALT_ASM_CMD "and x0, x0, x0\n" - +/** @brief Parses a list of ARMv8 assembly lines into an array of a64inst_instruction. + * + * @param asmLines An array of strings, each string is an ARMv8 assembly line. + * @param lineCount The number of lines in the asmLines array. + * @return An array of a64inst_instruction representing the parsed instructions. + */ a64inst_instruction *parse(char **asmLines, int lineCount); diff --git a/src/string_util.c b/src/string_util.c new file mode 100644 index 0000000..8b7aaa0 --- /dev/null +++ b/src/string_util.c @@ -0,0 +1,173 @@ +/** @file string_util.c + * @brief This file contains the implementation of some string processing + * utility functions used in the assembler. + * + * @author Saleh Bubshait + */ + +#include +#include +#include +#include +#include "string_util.h" +#include "global.h" + +/************************************ + * CONSTANTS + ************************************/ + +static const char *SPECIAL_REGISTERS[] = {"sp", "xzr", "wzr"}; +static const char *ZERO_REGISTER_ALIAS[] = {"xzr", "wzr"}; +static const char *ALIAS_OPCODES[] = {"cmp", "cmn", "neg", "negs", "tst", "mvn", "mov"}; +static char *ALIAS_TARGET_OPCODES[] = {"subs", "adds", "sub", "subs", "ands", "orn", "orr"}; + +/************************************ + * FUNCTIONS + ************************************/ + +char *trim(char *str) { + // Skip leading whitespace + while (isspace(*str)) { + str++; + } + + // If the string is all whitespace + if (*str == '\0') { + return str; + } + + // Skip trailing whitespace + char *end = str + strlen(str) - 1; + while (end > str && isspace(*end)) { + end--; + } + end[1] = '\0'; + + return str; +} + +bool containsString(char *str, const char *arr[], int arrSize) { + for (int i = 0; i < arrSize; i++) { + if (strcmp(str, arr[i]) == 0) { + 
return true; + } + } + return false; +} + +int lastIndexOfString(char *str, const char *arr[], int arrSize) { + for (int i = arrSize - 1; i >= 0; i--) { + if (strcmp(str, arr[i]) == 0) { + return i; + } + } + return -1; +} + +char *duplicateString(char *str) { + char *newStr = malloc(strlen(str) + 1); + strcpy(newStr, str); + return newStr; +} + +bool isRegister(char *str) { + if (str == NULL) + return false; + SKIP_WHITESPACE(str); + + if (containsString(str, SPECIAL_REGISTERS, 3)) + return true; + + return tolower(str[0]) == 'x' || tolower(str[0]) == 'w'; +} + +int getRegister(char *str) { + SKIP_WHITESPACE(str); + if (containsString(str, ZERO_REGISTER_ALIAS, 2)) { + return ZERO_REGISTER; + } + + return strtol(str + 1, NULL, 10); +} + +int getImmediate(char *str) { + SKIP_WHITESPACE(str); + if (strlen(str) < 2) { + return 0; + } + + if (str[0] != '#') + return 0; + + str++; // skip # + + if (strncmp(str, "0x", 2) == 0 || strncmp(str, "0X", 2) == 0) { + // Hex + return strtol(str + 2, NULL, 16); + } else { + // Decimal + return strtol(str, NULL, 10); + } + + return 0; +} + +int getRegisterType(char *str) { + SKIP_WHITESPACE(str); + + return tolower(str[0]) == 'x'; +} + + +/** @brief Translates an alias instruction into its target instruction. + * Note: This function modifies the input tokens array and the tokensCount. + * Assumes there is enough space in the tokens array to add the new tokens. + * + * @param opcode The opcode of the instruction. + * @param tokens The tokens of the instruction. + * @param tokensCount The number of tokens in the instruction. + */ +void translateAlias(char *opcode, char *tokens[], int *tokensCount) { + + int aliasIndex = lastIndexOfString(opcode, ALIAS_OPCODES, 7); + if (aliasIndex == -1) + return; + + // The instruction is one of the aliases, convert into the target. + char *targetOpcode = ALIAS_TARGET_OPCODES[aliasIndex]; + + // To correctly encode the zero register, which is either w31 or x31. 
+ char *zeroReg = malloc(5 * sizeof(char)); + *zeroReg = *tokens[1]; + strcpy(zeroReg + 1, "31"); + + switch(aliasIndex) { + case 0: // cmp -> subs rzr, rn, op2 + case 1: // cmn -> adds rzr, rn, op2 + case 4: // tst -> ands rzr, rn, op2 + // Convert from [instr] reg, op2 to [instr] rzr, reg, op2 + tokens[0] = targetOpcode; + tokens[4] = tokens[3]; + tokens[3] = tokens[2]; + tokens[2] = tokens[1]; + tokens[1] = zeroReg; + (*tokensCount)++; + break; + + case 2: // neg -> sub rd, rzr, op2 + case 3: // negs -> subs rd, rzr, op2 + case 5: // mvn -> orn rd, rzr, op2 + case 6: // mov -> orr rd, rzr, rm + tokens[0] = targetOpcode; + tokens[4] = tokens[3]; + tokens[3] = tokens[2]; + tokens[2] = zeroReg; + (*tokensCount)++; + break; + + default: + // Note, the multiply instructions are handled separately. + // See DPReg parsing. + break; + } +} diff --git a/src/string_util.h b/src/string_util.h new file mode 100644 index 0000000..c9bca35 --- /dev/null +++ b/src/string_util.h @@ -0,0 +1,64 @@ +/** @file string_util.h + * @brief This file declares the string processing + * utility functions used in the assembler. + * + * @author Saleh Bubshait + */ + +/** @brief Skips whitespace characters in a string. + * @param ptr A pointer to the string to skip whitespace in. + */ +#define SKIP_WHITESPACE(ptr) do { while (isspace((unsigned char)*(ptr))) { (ptr)++; } } while (0) + +/** @brief Removes leading and trailing whitespace from a string. + * Note. This function modifies the input string. + * @param str The string to trim. + * @return A pointer to the first non-whitespace character in the string. + */ +char *trim(char *str); + +/** @brief Checks if a string is in an array of strings. + * + * @param str The string to check. + * @param arr The array of strings to check against. + * @param arrSize The size of the array. + * @return True if the string is in the array, false otherwise. 
+ */ +bool containsString(char *str, const char *arr[], int arrSize); + +/** @brief Finds the last index of a string in an array of strings. + * Note: If multiple occurrences of the string exist, the index of the last + * occurrence is returned! + * + * @param str The string to find. + * @param arr The array of strings to search. + * @param arrSize The size of the array. + * @return The index of the last occurrence of the string in the array, or -1 if not found. + */ +int lastIndexOfString(char *str, const char *arr[], int arrSize); + +/** @brief Duplicates a string. + * Note: The caller is responsible for freeing the returned string. + * + * @param str The string to duplicate. + * @return A pointer to the duplicated string. + */ +char *duplicateString(char *str); + +/** @brief Checks if a string represents an ARMv8 register. + * A string is considered a register if it is: + * - A general purpose register (x0-x30 or w0-w30; only the leading x/w is checked) + * - A special register (sp, xzr, wzr) + * + * @param str The string to check. + * @return True if the string is a register, false otherwise. + */ +bool isRegister(char *str); +/** @brief Returns the register number: ZERO_REGISTER for a zero-register alias, otherwise the decimal number following the first character. */ +int getRegister(char *str); +/** @brief Parses an immediate written as '#' followed by a decimal number or a 0x-prefixed hexadecimal number; returns 0 if str does not start with '#'. */ +int getImmediate(char *str); +/** @brief Returns 1 if the first non-whitespace character of str is 'x' or 'X', and 0 otherwise. */ +int getRegisterType(char *str); +/** @brief Rewrites the tokens of an alias instruction in place into its target instruction (may add one token and increment *tokensCount); does nothing if opcode is not an alias. */ +void translateAlias(char *opcode, char *tokens[], int *tokensCount); diff --git a/src/symboltable.c b/src/symboltable.c index e93c84a..50db150 100644 --- a/src/symboltable.c +++ b/src/symboltable.c @@ -1,3 +1,11 @@ +/** @file symboltable.c + * @brief An Abstract Data Type (ADT) for a symbol table, an array of + * label-address pairs. Labels are strings and addresses are unsigned integers + * (uint32_t). The symbol table is implemented as a dynamic array.
+ * + * @author Saleh Bubshait + */ + #include #include #include diff --git a/src/symboltable.h b/src/symboltable.h index ba8b21c..ca1912d 100644 --- a/src/symboltable.h +++ b/src/symboltable.h @@ -1,3 +1,11 @@ +/** @file symboltable.h + * @brief An Abstract Data Type (ADT) for a symbol table, an array of + * label-address pairs. Labels are strings and addresses are unsigned integers + * (uint32_t). The symbol table is implemented as a dynamic array. + * + * @author Saleh Bubshait + */ + #include #include #include @@ -7,21 +15,56 @@ typedef uint32_t address; +/** An entry in the symbol table, a label-address pair. + */ typedef struct { char *label; address address; } symbol_table_map; +/** The symbol table ADT. + */ typedef struct { - symbol_table_map* table; - int size; - int capacity; + symbol_table_map* table; // entries + int size; // number of entries + int capacity; // allocated size of the table; capacity >= size } symbol_table; +/** @brief Initializes a new symbol table. + * + * @return A pointer to the new symbol table; release it with st_free. + */ symbol_table *st_init(void); +/** @brief Inserts a new label-address pair into the symbol table. + * Grows the table if it is full. If the label already exists in the table, + * another entry with the same label is inserted (for performance). + * + * @param st A pointer to the target symbol table. + * @param label The label to insert. + * @param addr The address to insert. + */ void st_insert(symbol_table *st, char *label, address addr); +/** @brief Checks if a label exists in the symbol table. + * + * @param st A pointer to the target symbol table. + * @param label The label to check. + * @return True if the label exists in the table, false otherwise. + */ bool st_contains(symbol_table *st, char *label); +/** @brief Gets the address of a label in the symbol table. + * st_contains should be called before calling this function! + * + * @param st A pointer to the target symbol table. + * @param label The label to get the address of.
+ * @return The address of the label in the table. + */ address st_get(symbol_table *st, char *label); + +/** @brief Frees the memory allocated for the symbol table. + * + * @param st A pointer to the target symbol table. + */ +void st_free(symbol_table *st); diff --git a/src/tokeniser.c b/src/tokeniser.c index 6e37d1a..3b907ac 100644 --- a/src/tokeniser.c +++ b/src/tokeniser.c @@ -1,33 +1,23 @@ -// Tokeniser.c +/** @file tokeniser.c + * @brief Functions to tokenise lines of assembly and operand strings. + * + * @author Saleh Bubshait + */ + #include #include #include #include #include #include +#include "tokeniser.h" +#include "string_util.h" -#define MAX_TOKEN_COUNT 5 -#define MAX_OPERAND_COUNT 4 +#define MAX_TOKEN_COUNT 6 /* number of slots in the array allocated by tokenise() */ +#define MAX_OPERAND_COUNT 5 /* presumably the slots allocated by tokeniseOperands() - TODO confirm */ #define OPERAND_DELIMITER ", " - -char *trim(char *str) { - while (isspace(*str)) { - str++; - } - - if (*str == '\0') { - return str; - } - - char *end = str + strlen(str) - 1; - while (end > str && isspace(*end)) { - end--; - } - - end[1] = '\0'; - - return str; -} +#define OPEN_BRACKET '[' +#define CLOSE_BRACKET ']' char **tokenise(char *line, int *numTokens) { char **tokens = malloc(MAX_TOKEN_COUNT * sizeof(char *));\ @@ -46,36 +36,22 @@ char **tokenise(char *line, int *numTokens) { char *operandStart = strtok(NULL, ""); if (operandStart == NULL) { - // No operands. Return the instruction token. + // No operands. Return the array holding only the first (opcode) token.
return tokens; } - bool inBracket = false; - char *currentToken = operandStart; + SKIP_WHITESPACE(operandStart); + + // Use tokeniseOperands to tokenise the operands + int operandTokensCount = 0; + char **operandTokens = tokeniseOperands(operandStart, &operandTokensCount); - for (char *c = operandStart; *c != '\0'; ++c) { - if (*c == '[' || *c == '{') { - inBracket = true; - } else if (*c == ']' || *c == '}') { - inBracket = false; - } - - - if (*c == ',' && !inBracket) { - *c = '\0'; - tokens[(*numTokens)++] = currentToken; - currentToken = c + 1; - while (*currentToken == ' ') { - currentToken++; - } - } - } - - if (*currentToken != '\0') { - tokens[*numTokens] = currentToken; - (*numTokens)++; + for (int i = 0; i < operandTokensCount && *numTokens < MAX_TOKEN_COUNT; i++) { /* never write past the MAX_TOKEN_COUNT slots of tokens; surplus operands are dropped */ + tokens[(*numTokens)++] = operandTokens[i]; } + + free(operandTokens); /* frees only the array; the tokens themselves point into the line */ return tokens; } @@ -86,42 +62,43 @@ char **tokeniseOperands(char *line, int *numTokens) { exit(EXIT_FAILURE); } - if (*line == '[') { + SKIP_WHITESPACE(line); + + // Remove leading and trailing brackets if they exist + if (*line == OPEN_BRACKET) { line++; // skip '[' - line[strlen(line) - 1] = '\0'; // remove ']' - } else if (*line == '{') { - line++; // skip '{' - line[strlen(line) - 1] = '\0'; // remove '}' + char *end = line + strlen(line) - 1; + while (end > line && *end != CLOSE_BRACKET) { + end--; + } + if (*end == CLOSE_BRACKET) { + *end = '\0'; + } } + line = trim(line); + *numTokens = 0; bool inBracket = false; char *currentToken = line; for (char *c = line; *c != '\0'; ++c) { - if (*c == '[' || *c == '{') { + if (*c == OPEN_BRACKET) { inBracket = true; - } else if (*c == ']' || *c == '}') { + } else if (*c == CLOSE_BRACKET) { inBracket = false; } if (*c == ',' && !inBracket) { *c = '\0'; tokens[(*numTokens)++] = currentToken; - currentToken = c + 1; - while (*currentToken == ' ') { - currentToken++; - } + currentToken = c + 1; // skip the comma + SKIP_WHITESPACE(currentToken); } } if (*currentToken != '\0') { tokens[*numTokens] = currentToken; - - if
(tokens[*numTokens][strlen(tokens[*numTokens]) - 1] == '\n') { - tokens[*numTokens][strlen(tokens[*numTokens]) - 1] = '\0'; - } - (*numTokens)++; } diff --git a/src/tokeniser.h b/src/tokeniser.h new file mode 100644 index 0000000..05b30fa --- /dev/null +++ b/src/tokeniser.h @@ -0,0 +1,26 @@ +/** @file tokeniser.h + * @brief Functions to tokenise lines of assembly and operand strings. + * + * @author Saleh Bubshait + */ + +/** @brief Tokenises a line of assembly code. The first two tokens are separated + * by a space, and the rest are separated by commas. + * e.g., "add x1, x2, x3" -> ["add", "x1", "x2", "x3"]. Handles and skips any + * whitespace, e.g., " add x1, x2,#4 " -> ["add", "x1", "x2", "#4"]. + * @param line The line to tokenise; it is modified in place. + * @param numTokens A pointer to an integer to store the number of tokens. + * @return A heap-allocated array of strings containing the tokens. + */ +char **tokenise(char *line, int *numTokens); + +/** @brief Tokenises the operands of an instruction. The operands are separated + * by commas. Handles and skips any whitespace, e.g., "x1, x2, #4" -> ["x1", "x2", "#4"]. + * If the line starts with '[', that bracket and the last ']' are removed. + * Note: anything after that closing bracket is removed as well, for example: + * "[x1, x2, #4]!" -> ["x1", "x2", "#4"]. + * @param line The line to tokenise; it is modified in place. + * @param numTokens A pointer to an integer to store the number of tokens. + * @return A heap-allocated array of tokens that point into line; the caller frees the array. + */ +char **tokeniseOperands(char *line, int *numTokens);