Add tokenizer

2020-11-30 18:44:22 -03:00
commit 10fbaf9a72
9 changed files with 311 additions and 0 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1,3 @@
+*.h linguist-language=C
+*.c linguist-language=C
+Makefile -linguist-detectable
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
+compiler
+tags
--- a/Makefile
+++ b/Makefile
@@ -0,0 +1,7 @@
+# Sources and headers of the compiler; headers are listed so that editing one
+# triggers a rebuild.
+FILES = tokenizer.c main.c parser.c
+HEADERS = tokenizer.h tokens.h parser.h
+INCLUDES = -I.
+CFLAGS = -std=c99 -g
+OUTFILE = compiler
+
+# `main` stays the default goal, but it is only an alias: the real target is
+# the output file, so make can tell when the binary is already up to date.
+.PHONY: main
+main: ${OUTFILE}
+
+${OUTFILE}: ${FILES} ${HEADERS}
+	${CC} ${CFLAGS} ${INCLUDES} -o ${OUTFILE} ${FILES}
--- a/main.c
+++ b/main.c
@@ -0,0 +1,40 @@
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <stdlib.h>
+#include "tokenizer.h"
+
+// Printable tag names, looked up by numeric token type (printtks indexes
+// this table with tks->type).
+const char* types[] = {
+	"keyword",
+	"symbol",
+	"integerConstant",
+	"stringConstant",
+	"identifier"
+};
+
+// Writes every token of the list to `output` as "<type> text </type>"
+// followed by CRLF, and frees each node together with its text while
+// walking. The list must not be used after this call.
+//
+// tks:    head of the token list; NULL is accepted and writes nothing.
+// output: stream the elements are written to.
+//
+// The walk is a loop rather than recursion so that stack usage does not
+// grow with the number of tokens.
+void printtks(TOKENLIST* tks, FILE* output) {
+	while(tks != NULL) {
+		// Fetch the successor before the node is released.
+		TOKENLIST* next = tks->next;
+		fprintf(output, "<%s> %s </%s>\r\n", types[tks->type], tks->token, types[tks->type]);
+		free(tks->token);
+		free(tks);
+		tks = next;
+	}
+}
+
+// Entry point: tokenizes the file named by argv[1] and writes the result to
+// "out.xml" in the current directory, wrapped in a <tokens> element.
+//
+// Returns 0 on success and 1 when no input file is given. When a file cannot
+// be opened or the output cannot be closed, returns the errno of the failed
+// call (or 1 if errno was left at 0).
+int main(int argc, char* argv[]) {
+	if(argc < 2) {
+		fprintf(stderr, "Usage: %s {input file}\n", argv[0]);
+		return 1;
+	}
+
+	FILE* input = fopen(argv[1], "r");
+
+	if(input == NULL) {
+		// Save errno first: the calls below are allowed to change it.
+		int err = errno;
+		fprintf(stderr, "%s\n", strerror(err));
+		return err != 0 ? err : 1;
+	}
+
+	FILE* output = fopen("out.xml", "w");
+
+	if(output == NULL) {
+		int err = errno;
+		fprintf(stderr, "%s\n", strerror(err));
+		fclose(input);
+		return err != 0 ? err : 1;
+	}
+
+	fprintf(output, "<tokens>\r\n");
+	// tokenize() closes `input`; printtks() frees the list it is handed.
+	printtks(tokenize(input), output);
+	fprintf(output, "</tokens>\r\n");
+
+	// fclose flushes buffered output, so a failure here means lost data.
+	if(fclose(output) != 0) {
+		int err = errno;
+		fprintf(stderr, "%s\n", strerror(err));
+		return err != 0 ? err : 1;
+	}
+
+	return 0;
+}
--- a/parser.c
+++ b/parser.c
--- a/parser.h
+++ b/parser.h
--- a/tokenizer.c
+++ b/tokenizer.c
@@ -0,0 +1,224 @@
+#include <ctype.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include "tokens.h"
+#include "tokenizer.h"
+
+// Character classes used to decide where one token stops and the next starts.
+typedef enum {
+	common,     // letters, digits, '_' and '"'
+	charsymbol, // any other non-whitespace character
+	space       // whitespace as reported by isspace()
+} CHARTYPE;
+
+typedef struct { // growable character buffer used to build up one token
+	char* str; // heap buffer; only NUL-terminated once a '\0' has been appended
+	int size; // bytes currently allocated for str
+	int count; // bytes currently in use, including an appended '\0' if any
+} STRING;
+
+// Allocates one list node with zeroed fields and NULL `token`/`next`
+// pointers. Prints a message and exits with status 1 when memory is
+// exhausted, so the result is never NULL. The caller owns the node and
+// releases it with free().
+TOKENLIST* mktokenlist(void) {
+	TOKENLIST* item = calloc(1, sizeof *item);
+	if(item == NULL) {
+		fprintf(stderr, "Out of memory\n");
+		exit(1);
+	}
+	// Set the pointers explicitly instead of relying on all-bits-zero being NULL.
+	item->token = NULL;
+	item->next = NULL;
+	return item;
+}
+
+// Classifies one input character for token splitting: whitespace maps to
+// `space`; letters, digits, '_' and '"' map to `common`; every other
+// character maps to `charsymbol`.
+CHARTYPE getchartype(unsigned char c) {
+	if(isspace(c))
+		return space;
+
+	bool wordchar = isalnum(c) || c == '_' || c == '"';
+	return wordchar ? common : charsymbol;
+}
+
+// Appends the byte `c` to `s`, growing the buffer when needed.
+//
+// Growth policy: once the allocation is no larger than count + 1, the
+// buffer is reallocated to twice that amount. Prints a message and exits
+// with status 1 if the reallocation fails; the old buffer is never
+// overwritten with NULL.
+void append(STRING* s, char c) {
+	int needed = s->count + 1;
+	if(s->size <= needed) {
+		int newsize = needed * 2;
+		// Keep the old pointer until realloc is known to have succeeded.
+		char* grown = realloc(s->str, newsize);
+		if(grown == NULL) {
+			fprintf(stderr, "Out of memory\n");
+			exit(1);
+		}
+		s->str = grown;
+		s->size = newsize;
+	}
+
+	s->str[s->count] = c;
+	s->count++;
+}
+
+// Creates an empty STRING whose buffer initially holds `size` bytes.
+//
+// A size below 1 is treated as 1 so that the buffer pointer is always
+// valid. Prints a message and exits with status 1 when memory is
+// exhausted. Release the result with freestr().
+STRING* mkstring(int size) {
+	if(size < 1)
+		size = 1;
+
+	STRING* str = malloc(sizeof *str);
+	char* buf = malloc(size);
+	if(str == NULL || buf == NULL) {
+		fprintf(stderr, "Out of memory\n");
+		exit(1);
+	}
+
+	str->str = buf;
+	str->size = size; // initial size
+	str->count = 0;
+	return str;
+}
+
+// Reports whether the NUL-terminated text in `tk` equals one of the entries
+// of the `keywords` table.
+bool iskeyword(STRING* tk) {
+	int idx = 0;
+	while(idx < keywordssize) {
+		if(strcmp(tk->str, keywords[idx]) == 0)
+			return true;
+		idx++;
+	}
+	return false;
+}
+
+// Reports whether `tk` holds exactly one character that appears in the
+// `symbols` table.
+bool issymbol(STRING* tk) {
+	// A count of 2 is one character plus the '\0' that appendtoken adds.
+	if(tk->count == 2) {
+		for(int idx = 0; idx < symbolssize; idx++) {
+			if(strcmp(tk->str, symbols[idx]) == 0)
+				return true;
+		}
+	}
+	return false;
+}
+
+// Reports whether `str` is a non-empty, NUL-terminated run of decimal
+// digits. The empty string is not an integer.
+bool isint(const char* str) {
+	if(str[0] == '\0')
+		return false;
+
+	for(const char* p = str; *p != '\0'; p++) {
+		// <ctype.h> functions need an unsigned char value; a negative plain
+		// char would be undefined behaviour.
+		if(!isdigit((unsigned char)*p))
+			return false;
+	}
+	return true;
+}
+
+// Reports whether the NUL-terminated text in `tk` is an integer constant:
+// all digits (per isint) with a value between 0 and 32767 inclusive.
+bool isintcons(STRING* tk) {
+	enum { maxintcons = 32767 };
+
+	if(!isint(tk->str))
+		return false;
+
+	// strtol, unlike atoi, is well defined for over-long digit runs: it
+	// returns LONG_MAX, which the range check below rejects.
+	long val = strtol(tk->str, NULL, 10);
+	return val >= 0 && val <= maxintcons;
+}
+
+// Reports whether `tk` is an identifier: it does not start with a digit and
+// consists only of letters, digits and '_'.
+bool isidentifier(STRING* tk) {
+	// <ctype.h> functions need an unsigned char value; a negative plain char
+	// would be undefined behaviour.
+	if(isdigit((unsigned char)tk->str[0]))
+		return false;
+
+	// count includes the '\0' that appendtoken adds, so skip the last byte.
+	int len = tk->count - 1;
+	for(int i = 0; i < len; i++) {
+		unsigned char ch = (unsigned char)tk->str[i];
+		if(!isalnum(ch) && ch != '_')
+			return false;
+	}
+	return true;
+}
+
+// Classifies the text in `tk`, trying keyword, symbol, integer constant and
+// identifier in that order. If none matches, reports the token together
+// with line `truen` on stderr and exits with status 1.
+TOKENTYPE gettokentype(STRING* tk, int truen) {
+	TOKENTYPE type;
+
+	if(iskeyword(tk))
+		type = keyword;
+	else if(issymbol(tk))
+		type = symbol;
+	else if(isintcons(tk))
+		type = integer;
+	else if(isidentifier(tk))
+		type = identifier;
+	else {
+		fprintf(stderr, "Unexpected token '%s'; line %i\n", tk->str, truen);
+		exit(1);
+	}
+
+	return type;
+}
+
+// Stores a copy of the text in `token` into the node `curitem` with the
+// given line number and type, links a fresh empty node after it, and resets
+// `token` to empty for the next token.
+//
+// Returns the fresh trailing node. Prints a message and exits with status 1
+// when memory is exhausted.
+TOKENLIST* appendtokenraw(TOKENLIST* curitem, STRING* token, int truen, TOKENTYPE type) {
+	// One spare byte guarantees termination even when the buffer does not
+	// already end in '\0'.
+	char* text = malloc((size_t)token->count + 1);
+	if(text == NULL) {
+		fprintf(stderr, "Out of memory\n");
+		exit(1);
+	}
+	memcpy(text, token->str, token->count);
+	text[token->count] = '\0';
+
+	curitem->token = text;
+	curitem->truen = truen;
+	curitem->type = type;
+
+	TOKENLIST* nextitem = mktokenlist();
+	if(nextitem == NULL) {
+		fprintf(stderr, "Out of memory\n");
+		exit(1);
+	}
+	// Give the trailing node defined pointers until it is filled in.
+	nextitem->token = NULL;
+	nextitem->next = NULL;
+
+	curitem->next = nextitem;
+	token->count = 0;
+	return nextitem;
+}
+
+// Terminates the text in `token`, classifies it with gettokentype and
+// stores it in `curitem` through appendtokenraw. Returns the new trailing
+// node.
+TOKENLIST* appendtoken(TOKENLIST* curitem, STRING* token, int truen) {
+	append(token, '\0');
+	TOKENTYPE type = gettokentype(token, truen);
+	return appendtokenraw(curitem, token, truen, type);
+}
+
+// Consumes characters from `input` up to and including the next newline, or
+// up to end of file if no newline remains.
+void skipln(FILE* input) {
+	// fgetc returns an int so that EOF stays distinct from every byte value.
+	// Storing it in an unsigned char would hide EOF and loop forever.
+	int c;
+	while((c = fgetc(input)) != EOF) {
+		if(c == '\n')
+			break;
+	}
+}
+
+// Consumes the rest of a block comment from `input`, up to and including
+// the closing "*/", or up to end of file if the comment is never closed.
+// Every newline consumed increments *lnscount.
+void skipmultiln(FILE* input, int* lnscount) {
+	// Remember the previous character instead of reading ahead after '*'.
+	// Reading ahead swallowed a character, which broke "**/" and skipped
+	// the line count for a newline directly after '*'.
+	int prev = 0;
+	int c; // int, so that EOF stays distinct from every byte value
+	while((c = fgetc(input)) != EOF) {
+		if(c == '\n')
+			(*lnscount)++;
+		else if(c == '/' && prev == '*')
+			break;
+		prev = c;
+	}
+}
+
+// Called after a '/' has been read: decides whether it starts a comment
+// and, if so, consumes the comment.
+//
+// "//" skips the rest of the line and adds one to *lnscount. "/*" (which
+// also covers "/**") skips to the closing "*/", counting newlines along the
+// way.
+//
+// Returns true if a comment was consumed. Otherwise the lookahead character
+// is pushed back and false is returned, so the caller can treat the '/' as
+// an ordinary symbol.
+bool handlecomment(FILE* input, int* lnscount) {
+	int nextc = fgetc(input); // int, so that EOF is not mistaken for a byte
+
+	if(nextc == '/') {
+		skipln(input);
+		(*lnscount)++;
+		return true;
+	}
+
+	if(nextc == '*') {
+		skipmultiln(input, lnscount);
+		return true;
+	}
+
+	// A single pushback is all the C standard guarantees. ungetc(EOF, ...)
+	// leaves the stream unchanged.
+	ungetc(nextc, input);
+	return false;
+}
+
+// Reads the body of a string constant from `input` into `tmp`, after the
+// opening '"' has been consumed. The closing '"' is consumed but not
+// stored, and the text in `tmp` is terminated with '\0'.
+//
+// A newline or end of file before the closing quote is an error: a message
+// naming line `truen` goes to stderr and the program exits with status 1.
+void readstr(FILE* input, STRING* tmp, int truen) {
+	int c; // int, so that EOF stays distinct from every byte value
+	while((c = fgetc(input)) != '"') {
+		if(c == EOF) {
+			fprintf(stderr, "Unexpected end of file; line %i\n", truen);
+			exit(1);
+		}
+		if(c == '\n') {
+			fprintf(stderr, "Unexpected end of line; line %i\n", truen);
+			exit(1);
+		}
+		append(tmp, (char)c);
+	}
+	append(tmp, '\0');
+}
+
+// Releases a STRING created by mkstring, including its buffer. Passing NULL
+// is allowed and does nothing, mirroring free().
+void freestr(STRING* str) {
+	if(str == NULL)
+		return;
+	free(str->str);
+	free(str);
+}
+
+// Emits the text pending in `token`, if any, as a classified token on line
+// `truen`, advancing the last/current node pointers.
+static void flushtoken(TOKENLIST** lastitem, TOKENLIST** curitem, STRING* token, int truen) {
+	if(token->count == 0)
+		return;
+	*lastitem = *curitem;
+	*curitem = appendtoken(*curitem, token, truen);
+}
+
+// Splits the contents of `input` into a linked list of tokens.
+//
+// Comments and whitespace separate tokens and are dropped. Each run of
+// letters, digits and '_' becomes one token, each other non-whitespace
+// character becomes a token of its own, and text between double quotes
+// becomes a string token without the quotes. Each token records the line it
+// was read on.
+//
+// input: stream to read; it is closed before this function returns.
+//
+// Returns the head of the list, or NULL when the input holds no tokens. The
+// caller owns the list and must free every node and its `token` text.
+// Malformed input makes the helpers print a message and exit.
+TOKENLIST* tokenize(FILE* input) {
+	TOKENLIST* head = mktokenlist();
+	TOKENLIST* lastitem = head;
+	// curitem is always an unused trailing node waiting for the next token.
+	TOKENLIST* curitem = head;
+
+	STRING* tmptoken = mkstring(200);
+	CHARTYPE lasttype = space;
+
+	int lnscount = 1;
+
+	// int rather than unsigned char, so that EOF (including a read error)
+	// ends the loop.
+	int c;
+	while((c = fgetc(input)) != EOF) {
+		if(c == '/') {
+			// handlecomment may advance the line counter; pending text
+			// belongs to the line the comment started on.
+			int startline = lnscount;
+			if(handlecomment(input, &lnscount)) {
+				// A comment separates tokens just like whitespace does.
+				flushtoken(&lastitem, &curitem, tmptoken, startline);
+				lasttype = space;
+				continue;
+			}
+		}
+
+		if(c == '"') {
+			flushtoken(&lastitem, &curitem, tmptoken, lnscount);
+			readstr(input, tmptoken, lnscount);
+			lastitem = curitem;
+			curitem = appendtokenraw(curitem, tmptoken, lnscount, string);
+			lasttype = space;
+			continue;
+		}
+
+		CHARTYPE curtype = getchartype((unsigned char)c);
+
+		// Word characters extend the pending word; anything else ends it.
+		// A symbol is always a token of its own, so a word character after
+		// a symbol ends the symbol as well.
+		if(curtype != common || lasttype == charsymbol)
+			flushtoken(&lastitem, &curitem, tmptoken, lnscount);
+		if(curtype != space)
+			append(tmptoken, (char)c);
+
+		// Count the newline only after the token ending on this line has
+		// been emitted, so that it keeps the right line number.
+		if(c == '\n')
+			lnscount++;
+
+		lasttype = curtype;
+	}
+
+	// Input that does not end in whitespace still has a token pending.
+	flushtoken(&lastitem, &curitem, tmptoken, lnscount);
+
+	freestr(tmptoken);
+	fclose(input);
+
+	// Drop the unused trailing node. If it is still the head, no token was
+	// produced and there is no list to return.
+	bool empty = (curitem == head);
+	free(curitem);
+	if(empty)
+		return NULL;
+
+	lastitem->next = NULL;
+	return head;
+}
--- a/tokenizer.h
+++ b/tokenizer.h
@@ -0,0 +1,18 @@
+#ifndef TOKENIZER_H
+#define TOKENIZER_H
+#include <stdio.h>
+
+// Kinds of token produced by the tokenizer. main.c indexes its `types` name
+// table with these values, so the order here must stay in step with it.
+typedef enum {
+	keyword,
+	symbol,
+	integer,
+	string,
+	identifier
+} TOKENTYPE;
+
+typedef struct tklist { // node of the singly linked token list
+	char* token; // heap-allocated, NUL-terminated text of the token
+	TOKENTYPE type; // classification of the token
+	int truen; // line number the tokenizer recorded for the token
+	struct tklist* next; // following node, or NULL at the end of the list
+} TOKENLIST;
+
+TOKENLIST* tokenize(FILE* input); // reads `input` to the end, closes it, and returns the head of the token list
+void freetokenlist(TOKENLIST l); // NOTE(review): no definition appears in this commit, and the list is taken by value - confirm the intended signature
+#endif
--- a/tokens.h
+++ b/tokens.h
@@ -0,0 +1,17 @@
+#ifndef TOKENS_H
+#define TOKENS_H
+
+// Reserved words recognised by the tokenizer, and the number of entries.
+// Both are `static` so that every .c file including this header gets its own
+// private copy; without it, a second including file would cause
+// duplicate-symbol link errors.
+static const char* const keywords[] = {
+	"class", "constructor", "function", "method", "field", "static",
+	"var", "int", "char", "boolean", "void", "true", "false", "null",
+	"this", "let", "do", "if", "else", "while", "return"
+};
+static const int keywordssize = sizeof(keywords) / sizeof(keywords[0]);
+
+// Single-character symbols recognised by the tokenizer, and the number of
+// entries. Both are `static` so that every .c file including this header
+// gets its own private copy; without it, a second including file would
+// cause duplicate-symbol link errors.
+static const char* const symbols[] = {
+	"{", "}", "(", ")", "[", "]", ".", ",", ";", "+", "-", "*", "/",
+	"&", "|", "<", ">", "=", "~"
+};
+static const int symbolssize = sizeof(symbols) / sizeof(symbols[0]);
+
+#endif