@@ -0,0 +1,420 @@
+//
+// main.c
+// Exp2_Parser
+//
+// Created by xcbosa on 2022/4/29.
+//
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define true 1
+#define false 0
+
+typedef char *String;
+typedef int Bool;
+
+Bool stringEquals(String left, String right) {
+    if (left == right) {
+        // String constants can be compared by address first: equal addresses
+        // always mean equal contents, which speeds up the common case.
+        return true;
+    }
+    return strcmp(left, right) == 0;
+}
+
+String stringFromStackString(String stackString) {
+    String str = (String)calloc(strlen(stackString) + 1, sizeof(char));
+    strcpy(str, stackString);
+    return str;
+}
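+
+// Note: stringFromStackString copies a stack buffer onto the heap so the returned
+// token body stays valid after the lexer function returns. The copies are never
+// freed; for this one-shot command-line tool that leak is harmless.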
+
+// MARK: - A hand-rolled FILE wrapper with a pushback buffer (stdio's FILE cannot
+// push more than one character back, so we keep our own stack of returned chars).
+
+typedef struct {
+    FILE *fp;
+    char buf[1024];   // pushback stack: cached_fgetc pops from here first
+    int bufSize;
+} CachedFile;
+
+typedef struct {
+    fpos_t fpos;
+    CachedFile lastState;
+} CachedFileFPos;
+
+CachedFile *cached_fopen(const String file) {
+    FILE *fp = fopen(file, "r");
+    if (fp == NULL) {
+        printf("Error: cannot open %s\n", file);
+        exit(-1);
+    }
+    CachedFile *cached = (CachedFile *)calloc(1, sizeof(CachedFile));
+    cached->fp = fp;
+    cached->bufSize = 0;
+    return cached;
+}
+
+char cached_fgetc(CachedFile *file) {
+    if (file->bufSize > 0) {
+        char ch = file->buf[file->bufSize - 1];
+        file->bufSize--;
+        return ch;
+    }
+    int raw = fgetc(file->fp);
+    // Map EOF to '\0' so the lexer can use '\0' as its end-of-input marker.
+    return raw == EOF ? '\0' : (char)raw;
+}
+
+void cached_fputc(char ch, CachedFile *file) {
+    file->buf[file->bufSize] = ch;
+    file->bufSize++;
+}
+
+int cached_fgetpos(CachedFile *file, CachedFileFPos *pos) {
+    fpos_t fpos;
+    int ret = fgetpos(file->fp, &fpos);
+    pos->fpos = fpos;
+    pos->lastState = *file;
+    return ret;
+}
+
+int cached_fsetpos(CachedFile *file, CachedFileFPos *pos) {
+    memcpy(file->buf, pos->lastState.buf, 1024);
+    file->bufSize = pos->lastState.bufSize;
+    return fsetpos(file->fp, &pos->fpos);
+}
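+
+// The fgetpos/fsetpos pair snapshots both the underlying stream position and the
+// pushback buffer, which is what lets lexNext try a rule speculatively and rewind
+// on failure. A minimal usage sketch (illustrative only):
+//
+//     CachedFileFPos saved;
+//     cached_fgetpos(file, &saved);     // remember where we are
+//     char ch = cached_fgetc(file);     // consume speculatively
+//     cached_fsetpos(file, &saved);     // give the character back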
+
+// MARK: - Definitions
+
+// Token-type name strings as specified in the assignment handout
+const String IDENFR = "IDENFR";
+const String INTCON = "INTCON";
+const String CHARCON = "CHARCON";
+const String STRCON = "STRCON";
+const String CONSTTK = "CONSTTK";
+const String INTTK = "INTTK";
+const String CHARTK = "CHARTK";
+const String VOIDTK = "VOIDTK";
+const String MAINTK = "MAINTK";
+const String IFTK = "IFTK";
+const String DOTK = "DOTK";
+
+const String ELSETK = "ELSETK";
+const String SWITCHTK = "SWITCHTK";
+const String CASETK = "CASETK";
+const String DEFAULTTK = "DEFAULTTK";
+const String WHILETK = "WHILETK";
+const String FORTK = "FORTK";
+const String SCANFTK = "SCANFTK";
+const String PRINTFTK = "PRINTFTK";
+const String RETURNTK = "RETURNTK";
+const String PLUS = "PLUS";
+
+const String MINU = "MINU";
+const String MULT = "MULT";
+const String DIV = "DIV";
+const String LSS = "LSS";
+const String LEQ = "LEQ";
+const String GRE = "GRE";
+const String GEQ = "GEQ";
+const String EQL = "EQL";
+const String NEQ = "NEQ";
+const String COLON = "COLON";
+
+const String ASSIGN = "ASSIGN";
+const String SEMICN = "SEMICN";
+const String COMMA = "COMMA";
+const String LPARENT = "LPARENT";
+const String RPARENT = "RPARENT";
+const String LBRACK = "LBRACK";
+const String RBRACK = "RBRACK";
+const String LBRACE = "LBRACE";
+const String RBRACE = "RBRACE";
+
+const String EOFTK = "EOF";
+
+struct Token;
+struct TokenRule;
+
+struct Token TokenEOF;
+
+// A token-parsing callback; returning TokenEmpty means this rule cannot parse the
+// input at the current position.
+typedef struct Token(*TokenBodyProviderBlock)(CachedFile*, struct TokenRule);
+
+typedef struct TokenRule {
+    String tokenBody;
+    String tokenTy;
+    // The parsing callback for this rule
+    TokenBodyProviderBlock tokenBodyProvider;
+} TokenRule;
+
+typedef struct Token {
+    String tokenBody;
+    String tokenTy;
+    Bool isEmpty;
+} Token;
+
+const Token TokenEmpty = { NULL, NULL, true };
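+
+// Each TokenRule pairs a literal (tokenBody, NULL for open-ended rules such as
+// identifiers and constants) with a token-type name and the callback that tries
+// to match it. TokenEmpty is the "no match" sentinel, and TokenEOF is filled in
+// by doInitialization once the string constants exist.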
+
+/// Parse end of file
+/// @param file File
+/// @param rule The rule that triggered this parser
+Token lexEOF(CachedFile *file, TokenRule rule);
+
+/// Keyword parsing; the keyword must be followed by a non-identifier character,
+/// so that a prefix is not accepted by mistake (e.g. int -> in)
+/// @param file File
+/// @param rule The rule that triggered this parser
+Token lexWords(CachedFile *file, TokenRule rule);
+
+/// Symbol parsing; no constraint on the following character. The order in rules
+/// must be longest-first so that a prefix is not accepted by mistake (e.g. <= -> <)
+/// @param file File
+/// @param rule The rule that triggered this parser
+Token lexSymbols(CachedFile *file, TokenRule rule);
+
+Token lexIDENFR(CachedFile *file, TokenRule rule);
+Token lexINTCON(CachedFile *file, TokenRule rule);
+Token lexCHARCON(CachedFile *file, TokenRule rule);
+Token lexSTRCON(CachedFile *file, TokenRule rule);
+
+TokenRule *rules;
+
+void doInitialization(void) {
+    TokenRule _rules[] = {
+        { NULL, EOFTK, &lexEOF },
+        { NULL, INTCON, &lexINTCON },
+        { NULL, CHARCON, &lexCHARCON },
+        { NULL, STRCON, &lexSTRCON },
+        { "const", CONSTTK, &lexWords },
+        { "int", INTTK, &lexWords },
+        { "char", CHARTK, &lexWords },
+        { "void", VOIDTK, &lexWords },
+        { "main", MAINTK, &lexWords },
+        { "if", IFTK, &lexWords },
+        { "do", DOTK, &lexWords },
+        { "else", ELSETK, &lexWords },
+        { "default", DEFAULTTK, &lexWords },
+        { "while", WHILETK, &lexWords },
+        { "for", FORTK, &lexWords },
+        { "scanf", SCANFTK, &lexWords },
+        { "printf", PRINTFTK, &lexWords },
+        { "return", RETURNTK, &lexWords },
+        { NULL, IDENFR, &lexIDENFR },
+        { "+", PLUS, &lexSymbols },
+        { "-", MINU, &lexSymbols },
+        { "*", MULT, &lexSymbols },
+        { "/", DIV, &lexSymbols },
+        { "<=", LEQ, &lexSymbols },
+        { "<", LSS, &lexSymbols },
+        { ">=", GEQ, &lexSymbols },
+        { ">", GRE, &lexSymbols },
+        { "==", EQL, &lexSymbols },
+        { "!=", NEQ, &lexSymbols },
+        { ":", COLON, &lexSymbols },
+        { "=", ASSIGN, &lexSymbols },
+        { ";", SEMICN, &lexSymbols },
+        { ",", COMMA, &lexSymbols },
+        { "(", LPARENT, &lexSymbols },
+        { ")", RPARENT, &lexSymbols },
+        { "[", LBRACK, &lexSymbols },
+        { "]", RBRACK, &lexSymbols },
+        { "{", LBRACE, &lexSymbols },
+        { "}", RBRACE, &lexSymbols },
+        { NULL, NULL, NULL }
+    };
+    rules = calloc(sizeof(_rules), 1);
+    memcpy(rules, _rules, sizeof(_rules));
+    TokenEOF.tokenTy = EOFTK;
+    TokenEOF.tokenBody = "";
+    TokenEOF.isEmpty = false;
+}
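+
+// The order of _rules is significant: keyword rules come before the identifier
+// rule, and multi-character symbols come before their single-character prefixes
+// ("<=" before "<", ">=" before ">", "==" before "="), since lexNext takes the
+// first rule that matches.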
+
+Bool isAlphaOrNumberOrUnderline(char ch) {
+    return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || (ch >= '0' && ch <= '9') || ch == '_';
+}
+
+Bool isAlphaOrUnderline(char ch) {
+    return (ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z') || ch == '_';
+}
+
+Bool isNumberOrXOrEOrPoint(char ch) {
+    return (ch >= '0' && ch <= '9') || ch == 'X' || ch == 'x' || ch == 'E' || ch == 'e' || ch == '.';
+}
+
+Bool isEmptyWord(char ch) {
+    return ch == ' ' || ch == '\n' || ch == '\r' || ch == '\t';
+}
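+
+// Character-class helpers. isAlphaOrUnderline / isAlphaOrNumberOrUnderline define
+// the identifier alphabet; isNumberOrXOrEOrPoint additionally accepts 'x', 'e' and
+// '.', presumably so that forms such as 0x1F or 1.5e3 are consumed as one INTCON
+// body rather than split mid-number; isEmptyWord matches skippable whitespace.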
+
+// MARK: - Lex Implementation
+
+/// Parse the next token by trying each rule in order
+/// @param file File
+Token lexNext(CachedFile *file) {
+    TokenRule *ruleIterator = &rules[0];
+    while (ruleIterator->tokenTy) {
+        char ch = '\0';
+        // Skip whitespace, then push the first non-blank character back.
+        while (isEmptyWord(ch = cached_fgetc(file))) ;
+        cached_fputc(ch, file);
+        CachedFileFPos fpos;
+        cached_fgetpos(file, &fpos);
+        Token result = ruleIterator->tokenBodyProvider(file, *ruleIterator);
+        if (result.isEmpty) {
+            // The rule did not match: rewind and try the next one.
+            cached_fsetpos(file, &fpos);
+        } else {
+            return result;
+        }
+        ruleIterator++;
+    }
+    return TokenEmpty;
+}
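+
+// lexNext is the heart of the lexer: for every rule it skips leading whitespace,
+// snapshots the stream, and lets the rule's callback try to consume a token. A
+// failed rule returns TokenEmpty and the snapshot is restored, so the next rule
+// sees exactly the same input. TokenEmpty from lexNext itself means no rule at
+// all matched the remaining input.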
+
+Token lexEOF(CachedFile *file, TokenRule rule) {
+    char ch = cached_fgetc(file);
+    if (ch == '\0') {
+        return TokenEOF;
+    }
+    cached_fputc(ch, file);
+    return TokenEmpty;
+}
+
+Token lexWords(CachedFile *file, TokenRule rule) {
+    char ch = '\0';
+    char *pexp = &rule.tokenBody[0];
+    while ((ch = cached_fgetc(file)) != '\0') {
+        if (*pexp == '\0') {
+            // The keyword must be followed by something other than a letter,
+            // digit or underscore
+            if (isAlphaOrNumberOrUnderline(ch)) {
+                return TokenEmpty;
+            }
+            cached_fputc(ch, file);
+            Token tk = { rule.tokenBody, rule.tokenTy, false };
+            return tk;
+        }
+        if (ch != *pexp) {
+            return TokenEmpty;
+        }
+        pexp++;
+    }
+    return TokenEOF;
+}
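+
+// lexWords walks the expected keyword character by character; once the whole
+// keyword has matched, the very next character must fall outside the identifier
+// alphabet. For example, on input "int x" the rule { "int", INTTK } accepts
+// "int" and pushes the space back, while on "inta" it returns TokenEmpty and the
+// identifier rule picks the whole word up instead. The fallthrough to TokenEOF
+// when the input ends mid-match assumes the source file ends with a newline.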
+
+Token lexSymbols(CachedFile *file, TokenRule rule) {
+    char ch = '\0';
+    char *pexp = &rule.tokenBody[0];
+    while ((ch = cached_fgetc(file)) != '\0') {
+        if (*pexp == '\0') {
+            cached_fputc(ch, file);
+            Token tk = { rule.tokenBody, rule.tokenTy, false };
+            return tk;
+        }
+        if (ch != *pexp) {
+            return TokenEmpty;
+        }
+        pexp++;
+    }
+    return TokenEOF;
+}
+
+Token lexIDENFR(CachedFile *file, TokenRule rule) {
+    char ch = '\0';
+    Bool first = true;
+    char tmp[1024], *pt = &tmp[0];
+    while ((ch = cached_fgetc(file)) != '\0') {
+        if (first) {
+            // An identifier must start with a letter or underscore
+            if (!isAlphaOrUnderline(ch)) {
+                return TokenEmpty;
+            }
+            first = false;
+        }
+        if (!isAlphaOrNumberOrUnderline(ch)) {
+            cached_fputc(ch, file);
+            if (pt != &tmp[0]) {
+                *(pt++) = '\0';
+                Token tk = { stringFromStackString(tmp), IDENFR, false };
+                return tk;
+            }
+        }
+        *(pt++) = ch;
+    }
+    return TokenEOF;
+}
+
+Token lexINTCON(CachedFile *file, TokenRule rule) {
+    char ch = '\0';
+    Bool first = true;
+    char tmp[1024], *pt = &tmp[0];
+    while ((ch = cached_fgetc(file)) != '\0') {
+        if (first) {
+            // An integer constant must start with a digit
+            if (!(ch >= '0' && ch <= '9')) {
+                return TokenEmpty;
+            }
+            first = false;
+        }
+        if (!isNumberOrXOrEOrPoint(ch)) {
+            cached_fputc(ch, file);
+            if (pt != &tmp[0]) {
+                *(pt++) = '\0';
+                Token tk = { stringFromStackString(tmp), INTCON, false };
+                return tk;
+            }
+        }
+        *(pt++) = ch;
+    }
+    return TokenEOF;
+}
+
+Token lexCHARCON(CachedFile *file, TokenRule rule) {
+    char tmp[1024], *pt = &tmp[0];
+    if (cached_fgetc(file) != '\'') {
+        return TokenEmpty;
+    }
+    //*(pt++) = '\'';
+    char c1 = cached_fgetc(file);
+    *(pt++) = c1;
+    if (c1 == '\\') {
+        // Keep the escaped character that follows a backslash
+        *(pt++) = cached_fgetc(file);
+    }
+    char end = cached_fgetc(file);
+    if (end == '\'') {
+        //*(pt++) = end;
+        *(pt++) = '\0';
+        Token tk = { stringFromStackString(tmp), CHARCON, false };
+        return tk;
+    }
+    printf("Error: Unterminated character constant: expected ' but got %c\n", end);
+    exit(-1);
+}
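+
+// lexCHARCON strips the surrounding quotes, so 'a' is stored as the body "a" and
+// '\n' as the two characters "\n". Only one escaped character after a backslash
+// is supported; anything not closed by a single quote aborts with an error.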
+
+Token lexSTRCON(CachedFile *file, TokenRule rule) {
+    char tmp[1024], *pt = &tmp[0];
+    if (cached_fgetc(file) != '\"') {
+        return TokenEmpty;
+    }
+    //*(pt++) = '\"';
+    while (true) {
+        char ch = cached_fgetc(file);
+        if (ch != '\"')
+            *(pt++) = ch;
+        if (ch == '\\') {
+            // Copy the escaped character through verbatim
+            char ch2 = cached_fgetc(file);
+            *(pt++) = ch2;
+        }
+        if (ch == '\"') {
+            break;
+        }
+        if (ch == '\0' || ch == '\n') {
+            printf("Error: Unterminated string constant: expected \" before end of line or file\n");
+            exit(-1);
+        }
+    }
+    *(pt++) = '\0';
+    Token tk = { stringFromStackString(tmp), STRCON, false };
+    return tk;
+}
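+
+// Like lexCHARCON, lexSTRCON drops the enclosing quotes and copies escape
+// sequences through unchanged; a string constant must be closed on the same line.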
+
+int main(int argc, const char * argv[]) {
+    doInitialization();
+    CachedFile *fp = cached_fopen("testfile.txt");
+    FILE *wp = fopen("output.txt", "w+");
+    Token currentWord;
+    while (!(currentWord = lexNext(fp)).isEmpty) {
+        if (stringEquals(currentWord.tokenTy, EOFTK)) {
+            return 0;
+        }
+        fprintf(wp, "%s %s\n", currentWord.tokenTy, currentWord.tokenBody);
+    }
+    return 0;
+}
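+
+// The driver reads testfile.txt and writes one "<TYPE> <body>" line per token to
+// output.txt. As an illustrative example, the input line
+//
+//     const int a = 10;
+//
+// would come out as:
+//
+//     CONSTTK const
+//     INTTK int
+//     IDENFR a
+//     ASSIGN =
+//     INTCON 10
+//     SEMICN ;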