hyc

Hydrogen Compiler written in C
git clone git://git.kocotian.pl/hyc.git
Log | Files | Refs | README | LICENSE

tokenize.c (5701B)


      1 /*
      2    hyc - Hydrogen Compiler written in C
      3    Copyright (C) 2021  Kacper Kocot <kocotian@kocotian.pl>
      4 
      5    This program is free software; you can redistribute it and/or modify
      6    it under the terms of the GNU General Public License as published by
      7    the Free Software Foundation; either version 3 of the License, or
      8    (at your option) any later version.
      9 
     10    This program is distributed in the hope that it will be useful,
     11    but WITHOUT ANY WARRANTY; without even the implied warranty of
     12    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     13    GNU General Public License for more details.
     14 
     15    You should have received a copy of the GNU General Public License
     16    along with this program; if not, write to the Free Software Foundation,
     17    Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA
     18 
     19 */
     20 
     21 #include <tokenize.h>
     22 #include <ctype.h>
     23 #include <stdlib.h>
     24 #include <string.h>
     25 
     26 Token
     27 newToken(void)
     28 {
     29 	Token t;
     30 	return *(Token *)(memset(&t, 0, sizeof t));
     31 }
     32 
     33 ssize_t
     34 tokenize(String input, Token **output)
     35 #define NOT_OVERFLOW (i < input.len)
     36 #define CURCHAR (input.data[i])
     37 #define NEXTCHAR (i + 1 < input.len ? input.data[i + 1] : 0)
     38 #define CHARPLUS(N) (i + (N) < input.len ? input.data[i + (N)] : 0)
     39 #define CURTOK ((*output)[tokcount - 1])
     40 #define TYPE(T) (CURTOK.type = Token##T)
     41 #define SEEKCHAR (++i, ++col)
     42 {
     43 	size_t tokcount;
     44 	size_t i, initial_i;
     45 	size_t ln, col;
     46 
     47 	*output = malloc(tokcount = 0);
     48 	i = ln = col = 0;
     49 
     50 	while (NOT_OVERFLOW) {
     51 		*output = realloc(*output, (++tokcount) * sizeof (**output));
     52 		CURTOK = newToken();
     53 
     54 		while (isspace(CURCHAR) && NOT_OVERFLOW) {
     55 			if (CURCHAR == '\n') {
     56 				++ln; col = 0;
     57 				++i;
     58 			} else SEEKCHAR;
     59 		}
     60 
     61 		CURTOK.str.data = input.data + i;
     62 		CURTOK.line = ln;
     63 		CURTOK.col = col;
     64 
     65 		initial_i = i;
     66 
     67 		if (!NOT_OVERFLOW) {
     68 			break;
     69 		/* Identifier */
     70 		} else if ((CURCHAR >= 'a' && CURCHAR <= 'z')
     71 		     ||  (CURCHAR >= 'A' && CURCHAR <= 'Z')
     72 			 ||  (CURCHAR == '_')) {
     73 			TYPE(Identifier);
     74 			SEEKCHAR;
     75 			while (((CURCHAR >= 'a' && CURCHAR <= 'z')
     76 			   ||   (CURCHAR >= 'A' && CURCHAR <= 'Z')
     77 			   ||   (CURCHAR >= '0' && CURCHAR <= '9')
     78 			   ||   (CURCHAR == '_'))
     79 			   &&  NOT_OVERFLOW) SEEKCHAR;
     80 		/* Integer */
     81 		} else if (CURCHAR >= '0' && CURCHAR <= '9') {
     82 			TYPE(Integer);
     83 			SEEKCHAR;
     84 			while ((CURCHAR >= '0' && CURCHAR <= '9')
     85 			   &&  NOT_OVERFLOW) SEEKCHAR;
     86 		/* String */
     87 		} else if (CURCHAR == '"') {
     88 			TYPE(String);
     89 			SEEKCHAR;
     90 			while (CURCHAR != '"' && NOT_OVERFLOW) SEEKCHAR;
     91 			SEEKCHAR;
     92 		/* Unary and conditional operators */
     93 		} else if (CURCHAR == '!') {
     94 			TYPE(ExclamationMark);
     95 			if (NEXTCHAR == '=') {
     96 				TYPE(NotEqual); SEEKCHAR;
     97 			}
     98 			SEEKCHAR;
     99 		} else if (CURCHAR == '=') {
    100 			TYPE(Assignment);
    101 			if (NEXTCHAR == '=') {
    102 				TYPE(Equal); SEEKCHAR;
    103 			}
    104 			SEEKCHAR;
    105 		} else if (CURCHAR == '-') {
    106 			TYPE(Minus);
    107 			if (NEXTCHAR == '-') {
    108 				TYPE(MinusMinus); SEEKCHAR;
    109 			} else if (NEXTCHAR == '=') {
    110 				TYPE(MinusEqual); SEEKCHAR;
    111 			}
    112 			SEEKCHAR;
    113 		} else if (CURCHAR == '+') {
    114 			TYPE(Plus);
    115 			if (NEXTCHAR == '+') {
    116 				TYPE(PlusPlus); SEEKCHAR;
    117 			} else if (NEXTCHAR == '=') {
    118 				TYPE(PlusEqual); SEEKCHAR;
    119 			}
    120 			SEEKCHAR;
    121 		/* Brackets */
    122 		} else if (CURCHAR == '(') { TYPE(OpeningParenthesis); SEEKCHAR;
    123 		} else if (CURCHAR == ')') { TYPE(ClosingParenthesis); SEEKCHAR;
    124 		} else if (CURCHAR == '[') { TYPE(OpeningBracket); SEEKCHAR;
    125 		} else if (CURCHAR == ']') { TYPE(ClosingBracket); SEEKCHAR;
    126 		} else if (CURCHAR == '{') { TYPE(OpeningBrace); SEEKCHAR;
    127 		} else if (CURCHAR == '}') { TYPE(ClosingBrace); SEEKCHAR;
    128 		/* Other single-char operators */
    129 		} else if (CURCHAR == '*') { TYPE(Asterisk); SEEKCHAR;
    130 		} else if (CURCHAR == '&') { TYPE(Amperstand); SEEKCHAR;
    131 		} else if (CURCHAR == ';') { TYPE(Semicolon); SEEKCHAR;
    132 		} else if (CURCHAR == ',') { TYPE(Comma); SEEKCHAR;
    133 		} else if (CURCHAR == '.') { TYPE(Dot); SEEKCHAR;
    134 		} else if (CURCHAR == ':') {
    135 			TYPE(Colon);
    136 			if (NEXTCHAR == ':') {
    137 				TYPE(DoubleColon); SEEKCHAR;
    138 			}
    139 			SEEKCHAR;
    140 		} else {
    141 			SEEKCHAR;
    142 		}
    143 
    144 		CURTOK.str.len = i - initial_i;
    145 	}
    146 
    147 	return (signed)--tokcount;
    148 }
    149 #undef NOT_OVERFLOW
    150 #undef CURCHAR
    151 #undef NEXTCHAR
    152 #undef CHARPLUS
    153 #undef CURTOK
    154 #undef TYPE
    155 #undef SEEKCHAR
    156 
    157 char *
    158 strTokenType(TokenType type)
    159 {
    160 	switch (type) {
    161 	case TokenNULL: return "<null>"; break;
    162 	case TokenIdentifier: return "identifier"; break;
    163 	case TokenInteger: return "integer"; break;
    164 	case TokenString: return "string"; break;
    165 	case TokenOpeningParenthesis: return "opening parenthesis"; break;
    166 	case TokenClosingParenthesis: return "closing parenthesis"; break;
    167 	case TokenOpeningBracket: return "opening bracket"; break;
    168 	case TokenClosingBracket: return "closing bracket"; break;
    169 	case TokenOpeningBrace: return "opening brace"; break;
    170 	case TokenClosingBrace: return "closing brace"; break;
    171 	case TokenExclamationMark: return "exclamation mark"; break;
    172 	case TokenAssignment: return "assignment sign"; break;
    173 	case TokenNotEqual: return "not equal sign"; break;
    174 	case TokenEqual: return "equal sign"; break;
    175 	case TokenMinus: return "minus sign"; break;
    176 	case TokenMinusMinus: return "double minus sign"; break;
    177 	case TokenMinusEqual: return "minus equal sign"; break;
    178 	case TokenPlus: return "plus sign"; break;
    179 	case TokenPlusPlus: return "double plus sign"; break;
    180 	case TokenPlusEqual: return "plus equal sign"; break;
    181 	case TokenAsterisk: return "asterisk"; break;
    182 	case TokenAmperstand: return "amperstand"; break;
    183 	case TokenSemicolon: return "semicolon"; break;
    184 	case TokenComma: return "comma"; break;
    185 	case TokenDot: return "dot"; break;
    186 	case TokenColon: return "colon"; break;
    187 	case TokenDoubleColon: return "double colon"; break;
    188 	}
    189 	return "unknown";
    190 }