From c629a01b593c8fa425d624d94f52acd336afffeb Mon Sep 17 00:00:00 2001
From: Augusto Gunsch <augustogunsch@tutanota.com>
Date: Mon, 21 Dec 2020 18:11:23 -0300
Subject: [PATCH] Reorganize tokenizer

---
 compiler-scopes.c              |   6 +-
 compiler.c                     |  10 ++--
 parser.c                       |   6 +-
 tokens.h => tokenizer-tables.h |   9 +--
 tokenizer.c                    | 106 ++++++++++++++++++++-------------
 tokenizer.h                    |   7 ++-
 util.h                         |   5 ++
 7 files changed, 92 insertions(+), 57 deletions(-)
 rename tokens.h => tokenizer-tables.h (68%)

diff --git a/compiler-scopes.c b/compiler-scopes.c
index f7606b4..84d0c8f 100644
--- a/compiler-scopes.c
+++ b/compiler-scopes.c
@@ -79,18 +79,18 @@ VARDEC* tovardec(OBJ* obj) {
 
 void doubledeclaration(char* name, DEBUGINFO* debug, OBJ* other) {
 	DEBUGINFO* debugother = other->getdebug(other);
-	fprintf(stderr, "Double declaration of '%s' at '%s', line %i; previously defined at '%s', line %i\n",
+	eprintf("Double declaration of '%s' at '%s', line %i; previously defined at '%s', line %i\n",
 				name, debug->file, debug->definedat, debugother->file, debugother->definedat);
 	exit(1);
 }
 
 void notdeclared(char* name, DEBUGINFO* debug) {
-	fprintf(stderr, "'%s' not declared; file '%s', line %i\n", name, debug->file, debug->definedat);
+	eprintf("'%s' not declared; file '%s', line %i\n", name, debug->file, debug->definedat);
 	exit(1);
 }
 
 void invalidparent(SUBROUTCALL* call) {
-	fprintf(stderr, "Invalid subroutine parent '%s'; file '%s', line %i\n", call->parentname, call->debug->file, call->debug->definedat);
+	eprintf("Invalid subroutine parent '%s'; file '%s', line %i\n", call->parentname, call->debug->file, call->debug->definedat);
 	exit(1);
 }
 
diff --git a/compiler.c b/compiler.c
index c63bd4c..e840f84 100644
--- a/compiler.c
+++ b/compiler.c
@@ -63,11 +63,11 @@ LINE* mathopln(char op) {
 		return onetoken("and");
 	if(op == '/') {
 		char* tokens[] = { "call", "Math.divide", "2" };
-		return mksimpleln(tokens, sizeof(tokens) / sizeof(char*));
+		return mksimpleln(tokens, strcount(tokens));
 	}
 	if(op == '*') {
 		char* tokens[] = { "call", "Math.multiply", "2" };
-		return mksimpleln(tokens, sizeof(tokens) / sizeof(char*));
+		return mksimpleln(tokens, strcount(tokens));
 	}
 }
 
@@ -77,7 +77,7 @@ LINEBLOCK* compileexpression(SCOPE* s, TERM* e) {
 
 	if(e->type == intconstant) {
 		char* tokens[] = { "push", "constant", itoa(e->integer) };
-		myblk = mklnblk(mksimpleln(tokens, sizeof(tokens) / sizeof(char*)));
+		myblk = mklnblk(mksimpleln(tokens, strcount(tokens)));
 	}
 	else if(e->type == unaryopterm) {
 		myblk = compileexpression(s, e->expression);
@@ -88,7 +88,7 @@ LINEBLOCK* compileexpression(SCOPE* s, TERM* e) {
 		myblk = compileexpression(s, e->expression);
 	}
 	else {
-		fprintf(stderr, "Unsupported term yet %i\n", e->type);
+		eprintf("Unsupported term yet %i\n", e->type);
 		exit(1);
 	}
 
@@ -178,7 +178,7 @@ LINEBLOCK* compilestatement(SCOPE* s, CLASS* c, STATEMENT* st) {
 	else if(st->type == returnstatement)
 		return compileret(s, st->retst);
 	else {
-		fprintf(stderr, "UNSUPPORTED\n");
+		eprintf("UNSUPPORTED\n");
 		exit(1);
 	}
 }
diff --git a/parser.c b/parser.c
index f338f7d..d305c00 100644
--- a/parser.c
+++ b/parser.c
@@ -42,7 +42,7 @@ const char* tokentypes[] = {
 DEBUGINFO* getdebug(PARSER* p) {
 	DEBUGINFO* d = (DEBUGINFO*)malloc(sizeof(DEBUGINFO));
 	d->file = p->file;
-	d->definedat = p->current->truen;
+	d->definedat = p->current->definedat;
 	return d;
 }
 
@@ -59,7 +59,7 @@ void restorecp(PARSER* p) {
 }
 
 void unexpectedtoken(PARSER* p) {
-	fprintf(stderr, "Unexpected token '%s' (of type %s); line %i, file '%s'\n", p->current->token, tokentypes[p->current->type], p->current->truen, p->file);
+	fprintf(stderr, "Unexpected token '%s' (of type %s); line %i, file '%s'\n", p->current->token, tokentypes[p->current->type], p->current->definedat, p->file);
 }
 
 void unexpected(PARSER* p) {
@@ -75,7 +75,7 @@ void checkcontent(PARSER* p, const char* content) {
 
 void checktype(PARSER* p, TOKENTYPE type) {
 	if(p->current->type != type) {
-		fprintf(stderr, "Unexpected %s; line %i, file '%s'\n", tokentypes[p->current->type], p->current->truen, p->file);
+		fprintf(stderr, "Unexpected %s; line %i, file '%s'\n", tokentypes[p->current->type], p->current->definedat, p->file);
 		exit(1);
 	}
 }
diff --git a/tokens.h b/tokenizer-tables.h
similarity index 68%
rename from tokens.h
rename to tokenizer-tables.h
index 5990c9b..768ebc7 100644
--- a/tokens.h
+++ b/tokenizer-tables.h
@@ -1,17 +1,18 @@
-#ifndef TOKENS_H
-#define TOKENS_H
+#ifndef TOKENIZER_TABLES_H
+#define TOKENIZER_TABLES_H
+#include "util.h"
 
 const char* keywords[] = {
 	"class", "constructor", "function", "method", "field", "static",
 	"var", "int", "char", "boolean", "void", "true", "false", "null",
 	"this", "let", "do", "if", "else", "while", "return"
 };
-const int keywordssize = sizeof(keywords) / sizeof(char*);
+const int keywordssize = strcount(keywords); // number of entries in keywords[]
 
 const char* symbols[] = {
 	"{", "}", "(", ")", "[", "]", ".", ",", ";", "+", "-", "*", "/",
 	"&", "|", "<", ">", "=", "~"
 };
-const int symbolssize = sizeof(symbols) / sizeof(char*);
+const int symbolssize = strcount(symbols);
 
 #endif 
diff --git a/tokenizer.c b/tokenizer.c
index e32c2d8..3838b4b 100644
--- a/tokenizer.c
+++ b/tokenizer.c
@@ -2,9 +2,10 @@
 #include <stdlib.h>
 #include <string.h>
 #include <stdbool.h>
-#include "tokens.h"
 #include "tokenizer.h"
+#include "tokenizer-tables.h"
 
+// Data types
 typedef enum {
 	common, charsymbol, space
 } CHARTYPE;
@@ -15,14 +16,38 @@ typedef struct {
 	int count;
 } STRING;
 
-TOKEN* mktokenlist() {
-	return (TOKEN*)malloc(sizeof(TOKEN));
-}
+// String manipulation
+STRING* mkstring(int size);
+void append(STRING* s, char c);
+void freestr(STRING* str);
 
-CHARTYPE getchartype(unsigned char c) {
-	if(isspace(c)) return space;
-	if(isalnum(c) || c == '_' || c == '"') return common;
-	return charsymbol;
+// Token manipulation
+TOKEN* appendtokenraw(TOKEN* curitem, STRING* token, int definedat, TOKENTYPE type);
+TOKEN* appendtoken(TOKEN* curitem, STRING* token, int definedat);
+#define mktoken() ((TOKEN*)malloc(sizeof(TOKEN))) // allocates one uninitialized TOKEN node; outer parentheses keep the cast bound ahead of any postfix operator
+
+// Char types
+CHARTYPE getchartype(unsigned char c);
+bool iskeyword(STRING* tk);
+bool issymbol(STRING* tk);
+bool isint(char* str);
+bool isintcons(STRING* tk);
+bool isidentifier(STRING* tk);
+TOKENTYPE gettokentype(STRING* tk, int definedat);
+
+// Stream handling
+void skipln(FILE* input);
+void skipmultiln(FILE* input, int* lnscount);
+bool handlecomment(FILE* input, int* lnscount);
+void readstr(FILE* input, STRING* tmp, int definedat);
+
+// String manipulation
+STRING* mkstring(int size) { // allocates an empty STRING whose buffer holds `size` bytes
+	STRING* str = (STRING*)malloc(sizeof(STRING)); // NOTE(review): malloc results are not checked in this function
+	str->size = sizeof(char) * size; // initial buffer capacity in bytes
+	str->str = (char*)malloc(str->size);
+	str->count = 0; // no characters stored yet
+	return str; // released with freestr()
 }
 
 void append(STRING* s, char c) {
@@ -36,12 +61,33 @@ void append(STRING* s, char c) {
 	s->count++;
 }
 
-STRING* mkstring(int size) {
-	STRING* str = (STRING*)malloc(sizeof(STRING));
-	str->size = sizeof(char) * size; // initial size
-	str->str = (char*)malloc(str->size);
-	str->count = 0;
-	return str;
+void freestr(STRING* str) { // releases a STRING: its character buffer first, then the struct itself
+	free(str->str);
+	free(str);
+}
+
+// Token manipulation
+TOKEN* appendtokenraw(TOKEN* curitem, STRING* token, int definedat, TOKENTYPE type) { // stores token in curitem with the given type; returns a new node linked after it
+	curitem->token = (char*)malloc(sizeof(char)*token->count); // count bytes; assumes count already includes the terminating '\0' — TODO confirm for all callers
+	strcpy(curitem->token, token->str); // copies up to and including the first '\0' in the buffer
+	curitem->definedat = definedat;
+	curitem->type = type;
+	TOKEN* nextitem = mktoken(); // NOTE(review): the new node's fields are left uninitialized here
+	curitem->next = nextitem;
+	token->count = 0; // empty the scratch STRING; its buffer is kept
+	return nextitem;
+}
+
+TOKEN* appendtoken(TOKEN* curitem, STRING* token, int definedat) { // terminates, classifies and stores the token; returns the next empty node
+	append(token, '\0'); // NUL-terminate so the buffer can be used as a C string
+	return appendtokenraw(curitem, token, definedat, gettokentype(token, definedat)); // gettokentype exits the program on an unrecognised token
+}
+
+// Char types
+CHARTYPE getchartype(unsigned char c) { // classifies one input byte as space, common or charsymbol
+	if(isspace(c)) return space;
+	if(isalnum(c) || c == '_' || c == '"') return common; // note: '"' is classified as common, not as a symbol
+	return charsymbol; // everything that is neither whitespace nor common
 }
 
 bool iskeyword(STRING* tk) {
@@ -88,31 +134,16 @@ bool isidentifier(STRING* tk) {
 	return true;
 }
 
-TOKENTYPE gettokentype(STRING* tk, int truen) {
+TOKENTYPE gettokentype(STRING* tk, int definedat) {
 	if(iskeyword(tk)) return keyword;
 	if(issymbol(tk)) return symbol;
 	if(isintcons(tk)) return integer;
 	if(isidentifier(tk)) return identifier;
-	fprintf(stderr, "Unexpected token '%s'; line %i\n", tk->str, truen);
+	eprintf("Unexpected token '%s'; line %i\n", tk->str, definedat);
 	exit(1);
 }
 
-TOKEN* appendtokenraw(TOKEN* curitem, STRING* token, int truen, TOKENTYPE type) {
-	curitem->token = (char*)malloc(sizeof(char)*token->count);
-	strcpy(curitem->token, token->str);
-	curitem->truen = truen;
-	curitem->type = type;
-	TOKEN* nextitem = mktokenlist();
-	curitem->next = nextitem;
-	token->count = 0;
-	return nextitem;
-}
-
-TOKEN* appendtoken(TOKEN* curitem, STRING* token, int truen) {
-	append(token, '\0');
-	return appendtokenraw(curitem, token, truen, gettokentype(token, truen));
-}
-
+// Stream handling
 void skipln(FILE* input) {
 	unsigned char c;
 	while(c = fgetc(input), c != '\0')
@@ -149,11 +180,11 @@ bool handlecomment(FILE* input, int* lnscount) {
 	return false;
 }
 
-void readstr(FILE* input, STRING* tmp, int truen) {
+void readstr(FILE* input, STRING* tmp, int definedat) {
 	unsigned char c;
 	while(c = fgetc(input), c != '\0') {
 		if(c == '\n') {
-			fprintf(stderr, "Unexpected end of line; line %i", truen);
+			eprintf("Unexpected end of line; line %i", definedat);
 			exit(1);
 		}
 		if(c == '"')
@@ -163,13 +194,8 @@ void readstr(FILE* input, STRING* tmp, int truen) {
 	append(tmp, '\0');
 }
 
-void freestr(STRING* str) {
-	free(str->str);
-	free(str);
-}
-
 TOKEN* tokenize(FILE* input) {
-	TOKEN* head = mktokenlist();
+	TOKEN* head = mktoken();
 	TOKEN* lastitem = head;
 	TOKEN* curitem = head;
 
diff --git a/tokenizer.h b/tokenizer.h
index e631a42..5c4cda6 100644
--- a/tokenizer.h
+++ b/tokenizer.h
@@ -2,6 +2,9 @@
 #define TOKENIZER_H
 #include <stdio.h>
 
+/* tokenizer
+ * Simple tool that splits a stream into many tokens. */
+
 typedef enum {
 	keyword, identifier, symbol, integer, string
 } TOKENTYPE;
@@ -9,10 +12,10 @@ typedef enum {
 typedef struct token {
 	char* token;
 	TOKENTYPE type;
-	int truen;
+	int definedat;
 	struct token* next;
 } TOKEN;
 
 TOKEN* tokenize(FILE* input);
-void freetokenlist(TOKEN l);
+void freetokenlist(TOKEN* list);
 #endif
diff --git a/util.h b/util.h
index 61521c6..8122ced 100644
--- a/util.h
+++ b/util.h
@@ -5,6 +5,11 @@
 /* util
  * Random utilities. */
 
+// Macros
+#define eprintf(...) fprintf (stderr, __VA_ARGS__) // printf-style output to stderr; the format string is the first argument
+#define count(array, type) ((sizeof(array)) / (sizeof(type))) // element count; `array` must be a real array (not a pointer) whose elements are `type`
+#define strcount(array) count(array, char*) // element count of an array of char*
+
+
 typedef struct stringlist {
 	char* content;
 	struct stringlist* next;