commit 10fbaf9a7255d97f1031caf7776e662b3f063817
Author: Augusto Gunsch
Date:   Mon Nov 30 18:44:22 2020 -0300

    Add tokenizer

diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 0000000..68b6d45
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1,3 @@
+*.h linguist-language=C
+*.c linguist-language=C
+Makefile -linguist-detectable
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..54899e8
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+compiler
+tags
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..49c0761
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,7 @@
+FILES = tokenizer.c main.c parser.c
+INCLUDES = -I.
+CFLAGS = -std=c99 -g
+OUTFILE = compiler
+
+main: ${FILES}
+	${CC} ${CFLAGS} ${INCLUDES} -o ${OUTFILE} ${FILES}
diff --git a/main.c b/main.c
new file mode 100644
index 0000000..31684ad
--- /dev/null
+++ b/main.c
@@ -0,0 +1,71 @@
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "tokenizer.h"
+
+// XML tag names, indexed by TOKENTYPE (same order as the enum in tokenizer.h).
+const char* types[] = {
+    "keyword", "symbol", "integerConstant", "stringConstant", "identifier"
+};
+
+// Writes text to output, replacing the characters that are special in XML.
+static void printescaped(const char* text, FILE* output) {
+    for(; *text != '\0'; text++) {
+        switch(*text) {
+            case '<': fputs("&lt;", output); break;
+            case '>': fputs("&gt;", output); break;
+            case '&': fputs("&amp;", output); break;
+            case '"': fputs("&quot;", output); break;
+            default: fputc(*text, output); break;
+        }
+    }
+}
+
+// Prints every token in tks as one "<type> token </type>" line and frees the
+// list as it goes, so tks must not be used afterwards. A NULL list prints nothing.
+void printtks(TOKENLIST* tks, FILE* output) {
+    // Iterate instead of recursing: one stack frame per token does not scale to large inputs.
+    while(tks != NULL) {
+        TOKENLIST* next = tks->next;
+        fprintf(output, "<%s> ", types[tks->type]);
+        printescaped(tks->token, output);
+        fprintf(output, " </%s>\r\n", types[tks->type]);
+        free(tks->token);
+        free(tks);
+        tks = next;
+    }
+}
+
+// Tokenizes the file named by argv[1] and writes the tokens to out.xml.
+int main(int argc, char* argv[]) {
+    if(argc < 2) {
+        fprintf(stderr, "Usage: %s {input file}\n", argv[0]);
+        return 1;
+    }
+
+    FILE* input = fopen(argv[1], "r");
+    if(input == NULL) {
+        int err = errno; // fprintf/strerror may overwrite errno
+        fprintf(stderr, "%s\n", strerror(err));
+        return err;
+    }
+
+    FILE* output = fopen("out.xml", "w");
+    if(output == NULL) {
+        int err = errno;
+        fprintf(stderr, "out.xml: %s\n", strerror(err));
+        fclose(input);
+        return err;
+    }
+
+    fprintf(output, "<tokens>\r\n");
+    printtks(tokenize(input), output); // tokenize closes input
+    fprintf(output, "</tokens>\r\n");
+
+    if(fclose(output) != 0) {
+        fprintf(stderr, "out.xml: %s\n", strerror(errno));
+        return 1;
+    }
+    return 0;
+}
diff --git a/parser.c b/parser.c
new file mode 100644
index 0000000..e69de29
diff --git a/parser.h b/parser.h
new file mode 100644
index 0000000..e69de29
diff --git a/tokenizer.c b/tokenizer.c
new file mode 100644
index 0000000..56712e7
--- /dev/null
+++ b/tokenizer.c
@@
-0,0 +1,298 @@
+#include <ctype.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "tokens.h"
+#include "tokenizer.h"
+
+// Coarse character classes that drive token splitting in tokenize().
+typedef enum {
+    common, charsymbol, space
+} CHARTYPE;
+
+// Growable character buffer: str holds size bytes, of which count are in use.
+typedef struct {
+    char* str;
+    int size;
+    int count;
+} STRING;
+
+// Allocation failures are not recoverable here; report and terminate.
+static void outofmemory(void) {
+    fprintf(stderr, "Out of memory\n");
+    exit(1);
+}
+
+// Allocates a zeroed list node (token and next start out NULL).
+TOKENLIST* mktokenlist(void) {
+    TOKENLIST* item = calloc(1, sizeof *item);
+    if(item == NULL)
+        outofmemory();
+    return item;
+}
+
+// Classifies c as whitespace, a word character (letter, digit, '_', '"')
+// or a symbol character (everything else).
+CHARTYPE getchartype(unsigned char c) {
+    if(isspace(c)) return space;
+    if(isalnum(c) || c == '_' || c == '"') return common;
+    return charsymbol;
+}
+
+// Appends c to s, doubling the buffer whenever it is about to fill up.
+void append(STRING* s, char c) {
+    int needed = s->count + 1;
+    if(s->size <= needed) {
+        char* grown = realloc(s->str, needed * 2);
+        if(grown == NULL) // keep s->str valid instead of overwriting it with NULL
+            outofmemory();
+        s->str = grown;
+        s->size = needed * 2;
+    }
+
+    s->str[s->count] = c;
+    s->count++;
+}
+
+// Creates an empty STRING with room for size bytes (at least one).
+STRING* mkstring(int size) {
+    STRING* str = malloc(sizeof *str);
+    if(str == NULL)
+        outofmemory();
+    str->size = size > 0 ? size : 1; // malloc(0) may legitimately return NULL
+    str->str = malloc(str->size);
+    if(str->str == NULL)
+        outofmemory();
+    str->count = 0;
+    return str;
+}
+
+// True when the NUL-terminated text in tk is one of the language keywords.
+bool iskeyword(STRING* tk) {
+    for(int i = 0; i < keywordssize; i++)
+        if(!strcmp(tk->str, keywords[i]))
+            return true;
+    return false;
+}
+
+// True when the NUL-terminated text in tk is a single symbol character.
+bool issymbol(STRING* tk) {
+    if(strlen(tk->str) != 1)
+        return false;
+    for(int i = 0; i < symbolssize; i++)
+        if(!strcmp(tk->str, symbols[i]))
+            return true;
+    return false;
+}
+
+// True when str is a non-empty run of decimal digits.
+bool isint(const char* str) {
+    if(*str == '\0')
+        return false;
+    for(; *str != '\0'; str++)
+        if(!isdigit((unsigned char)*str)) // plain char may be negative, which is undefined for isdigit
+            return false;
+    return true;
+}
+
+// True when tk is an integer constant in the accepted range 0..32767.
+bool isintcons(STRING* tk) {
+    if(!isint(tk->str))
+        return false;
+    // strtol saturates at LONG_MAX on overflow (atoi is undefined there),
+    // so over-long digit runs are simply rejected by the range check.
+    return strtol(tk->str, NULL, 10) <= 32767;
+}
+
+// True when tk consists of letters, digits and '_' and does not start with a digit.
+bool isidentifier(STRING* tk) {
+    const char* s = tk->str;
+    if(*s == '\0' || isdigit((unsigned char)*s))
+        return false;
+
+    for(; *s != '\0'; s++)
+        if(!isalnum((unsigned char)*s) && *s != '_')
+            return false;
+    return true;
+}
+
+// Classifies the NUL-terminated token in tk; prints an error naming line truen
+// and exits when it matches no token class.
+TOKENTYPE gettokentype(STRING* tk, int truen) {
+    if(iskeyword(tk)) return keyword;
+    if(issymbol(tk)) return symbol;
+    if(isintcons(tk)) return integer;
+    if(isidentifier(tk)) return identifier;
+    fprintf(stderr, "Unexpected token '%s'; line %i\n", tk->str, truen);
+    exit(1);
+}
+
+// Stores a copy of the NUL-terminated text in token into curitem, empties
+// token for reuse and returns the fresh node linked after curitem.
+TOKENLIST* appendtokenraw(TOKENLIST* curitem, STRING* token, int truen, TOKENTYPE type) {
+    size_t len = strlen(token->str) + 1;
+    curitem->token = malloc(len);
+    if(curitem->token == NULL)
+        outofmemory();
+    memcpy(curitem->token, token->str, len);
+    curitem->truen = truen;
+    curitem->type = type;
+    TOKENLIST* nextitem = mktokenlist();
+    curitem->next = nextitem;
+    token->count = 0;
+    return nextitem;
+}
+
+// Terminates the pending text in token, classifies it and appends it to the list.
+TOKENLIST* appendtoken(TOKENLIST* curitem, STRING* token, int truen) {
+    append(token, '\0');
+    return appendtokenraw(curitem, token, truen, gettokentype(token, truen));
+}
+
+// Consumes input up to and including the next newline (or the end of input).
+void skipln(FILE* input) {
+    int c; // int, not unsigned char: EOF must stay distinguishable or the loop never ends
+    while((c = fgetc(input)) != EOF)
+        if(c == '\n')
+            break;
+}
+
+// Consumes the rest of a block comment through its closing "*/", counting
+// the newlines it contains into *lnscount.
+void skipmultiln(FILE* input, int* lnscount) {
+    int prev = '\0';
+    int c;
+    while((c = fgetc(input)) != EOF) {
+        if(c == '\n')
+            (*lnscount)++;
+        else if(c == '/' && prev == '*') // remembering prev also handles "**/"
+            return;
+        prev = c;
+    }
+    fprintf(stderr, "Unterminated comment; line %i\n", *lnscount);
+    exit(1);
+}
+
+// Called after a '/' was read. Skips a "//" or "/* */" comment and returns
+// true; otherwise pushes the peeked character back and returns false.
+bool handlecomment(FILE* input, int* lnscount) {
+    int nextc = fgetc(input);
+    if(nextc == '/') {
+        skipln(input);
+        (*lnscount)++;
+        return true;
+    }
+    if(nextc == '*') {
+        skipmultiln(input, lnscount);
+        return true;
+    }
+    if(nextc != EOF) // only a single pushed-back character is guaranteed
+        ungetc(nextc, input);
+    return false;
+}
+
+// Reads a string constant body (the opening '"' is already consumed) into
+// tmp, NUL-terminated and without the quotes. Strings may not span lines.
+void readstr(FILE* input, STRING* tmp, int truen) {
+    int c;
+    while((c = fgetc(input)) != '"') {
+        if(c == EOF) {
+            fprintf(stderr, "Unexpected end of file; line %i\n", truen);
+            exit(1);
+        }
+        if(c == '\n') {
+            fprintf(stderr, "Unexpected end of line; line %i\n", truen);
+            exit(1);
+        }
+        append(tmp, (char)c);
+    }
+    append(tmp, '\0');
+}
+
+// Releases str together with its character buffer.
+void freestr(STRING* str) {
+    free(str->str);
+    free(str);
+}
+
+// Frees every node of l together with its token text. NULL is allowed.
+void freetokenlist(TOKENLIST* l) {
+    while(l != NULL) {
+        TOKENLIST* next = l->next;
+        free(l->token);
+        free(l);
+        l = next;
+    }
+}
+
+// Splits input into a linked list of tokens and closes input. Returns NULL
+// when input contains no tokens; the caller owns and frees the list.
+TOKENLIST* tokenize(FILE* input) {
+    TOKENLIST* head = mktokenlist();
+    TOKENLIST* lastitem = NULL; // last node holding a real token
+    TOKENLIST* curitem = head;  // empty node the next token is written into
+
+    STRING* tmptoken = mkstring(200);
+    CHARTYPE lasttype = space;
+    int lnscount = 1;
+
+    int c;
+    while((c = fgetc(input)) != EOF) {
+        if(c == '/') {
+            int startln = lnscount; // the pending token belongs to the line the comment starts on
+            if(handlecomment(input, &lnscount)) {
+                // a comment separates tokens just like whitespace does
+                if(lasttype != space) {
+                    lastitem = curitem;
+                    curitem = appendtoken(curitem, tmptoken, startln);
+                }
+                lasttype = space;
+                continue;
+            }
+        }
+        else if(c == '"') {
+            if(lasttype != space) {
+                lastitem = curitem;
+                curitem = appendtoken(curitem, tmptoken, lnscount);
+            }
+            readstr(input, tmptoken, lnscount);
+            lastitem = curitem;
+            curitem = appendtokenraw(curitem, tmptoken, lnscount, string);
+            lasttype = space;
+            continue;
+        }
+
+        CHARTYPE curtype = getchartype((unsigned char)c);
+
+        // Whitespace and symbols end whatever is pending; a word character
+        // only ends a pending symbol (symbols are always one character long).
+        if(lasttype != space && (curtype != common || lasttype == charsymbol)) {
+            lastitem = curitem;
+            curitem = appendtoken(curitem, tmptoken, lnscount);
+        }
+        if(curtype != space)
+            append(tmptoken, (char)c);
+
+        // count the newline only after flushing, so a token at the end of a
+        // line is reported on its own line
+        if(c == '\n')
+            lnscount++;
+        lasttype = curtype;
+    }
+
+    // input may end without trailing whitespace; do not drop the last token
+    if(lasttype != space) {
+        lastitem = curitem;
+        curitem = appendtoken(curitem, tmptoken, lnscount);
+    }
+
+    free(curitem); // the trailing node never received a token
+    freestr(tmptoken);
+    fclose(input);
+
+    if(lastitem == NULL) // no tokens at all: head was the node just freed
+        return NULL;
+    lastitem->next = NULL;
+    return head;
+}
diff --git a/tokenizer.h b/tokenizer.h
new file mode 100644
index 0000000..e06b903
--- /dev/null
+++ b/tokenizer.h
@@ -0,0 +1,21 @@
+#ifndef TOKENIZER_H
+#define TOKENIZER_H
+#include <stdio.h>
+
+typedef enum {
+    keyword, symbol, integer, string, identifier
+} TOKENTYPE;
+
+// One node of the singly linked token list produced by tokenize().
+typedef struct tklist {
+    char* token;          // heap-allocated, NUL-terminated token text
+    TOKENTYPE type;
+    int truen;            // line number the token was read on
+    struct tklist* next;  // NULL on the last node
+} TOKENLIST;
+
+// Tokenizes and closes input; returns NULL when it holds no tokens.
+TOKENLIST* tokenize(FILE* input);
+// Frees a whole list. Takes the head pointer: a by-value node could not be freed.
+void freetokenlist(TOKENLIST* l);
+#endif
diff --git a/tokens.h b/tokens.h
new file mode 100644
index 0000000..5990c9b
--- /dev/null
+++ b/tokens.h
@@ -0,0 +1,19 @@
+#ifndef TOKENS_H
+#define TOKENS_H
+
+// static: these are definitions, so without internal linkage a second
+// translation unit including this header would fail to link.
+static const char* const keywords[] = {
+    "class", "constructor", "function", "method", "field", "static",
+    "var", "int", "char", "boolean", "void", "true", "false", "null",
+    "this", "let", "do", "if", "else", "while", "return"
+};
+static const int keywordssize = sizeof(keywords) / sizeof(keywords[0]);
+
+static const char* const symbols[] = {
+    "{", "}", "(", ")", "[", "]", ".", ",", ";", "+", "-", "*", "/",
+    "&", "|", "<", ">", "=", "~"
+};
+static const int symbolssize = sizeof(symbols) / sizeof(symbols[0]);
+
+#endif