From f379ebd87e89a6ed82309cf870d9f72fbc95632b Mon Sep 17 00:00:00 2001 From: Lerking Date: Sat, 15 Dec 2018 01:16:36 +0100 Subject: [PATCH] Added more parsing functionallity. --- .gitignore | 1 + h2inc.py | 12 ++-- parser.py | 178 ++++++++++++++++++++++++++++------------------------- 3 files changed, 100 insertions(+), 91 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e5421ed --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +__pycache__/parser.cpython-36.pyc diff --git a/h2inc.py b/h2inc.py index dd69701..93abddd 100644 --- a/h2inc.py +++ b/h2inc.py @@ -72,13 +72,10 @@ class H2INC: break self.tupfile = [] for lines in fh: - #outfile = outfile+lines #Initial phase - self.tupline = [] - analyzed_line = parse.analyzer(lines) - self.tupfile.append(analyzed_line) - #self.tupfile.append(lines) - passes += 1 + self.tupfile.append(lines) fh.close() + resultfile = parse.parseheader(self.tupfile) + print(resultfile) #outputfile = os.path.splitext(inputfile)[0]+'.inc' #outputfile = str(outputfile).replace(self.sourcedir, self.destdir) @@ -108,5 +105,4 @@ if __name__ == "__main__": #print(app.folderlist) #for f in app.filelist: #app.read_file(f) - app.read_file("gtk.h") #testfile for comments and header includes - print(app.tupfile) + app.read_file("gtk.h") #testfile for comments and header includes \ No newline at end of file diff --git a/parser.py b/parser.py index 0d6c056..6b943ab 100644 --- a/parser.py +++ b/parser.py @@ -1,9 +1,15 @@ """Contains class PARSER""" from itertools import count -TOKENS = ['CSTART','CMID','CEND','RPAREN','LPAREN','ENDLINE','RETVAL','PREPROCESS', - 'ID','PLUS','MINUS','DIV','MULT','ASSIGN','EQUAL','LBRACE','RBRACE', - 'COMMA','SEMICOLON','LANGLE','RANGLE','POINTER'] +#Element type definitions. Used in the parse process. +ELEMENT_TYPE_PREPROCESS = 1 +ELEMENT_TYPE_REGULAR = 2 + +TOKENS = ['TOKEN_CSTART','TOKEN_CMID','TOKEN_CEND','TOKEN_RPAREN', + 'TOKEN_LPAREN','TOKEN_ENDLINE','TOKEN_RETVAL','TOKEN_PREPROCESS', + 'TOKEN_ID','TOKEN_PLUS','TOKEN_MINUS','TOKEN_DIV','TOKEN_MULT', + 'TOKEN_ASSIGN','TOKEN_EQUAL','TOKEN_LBRACE','TOKEN_RBRACE', + 'TOKEN_COMMA','TOKEN_SEMICOLON','TOKEN_LANGLE','TOKEN_RANGLE','TOKEN_POINTER'] RESERVED = {'auto' : 'AUTO','break' : 'BREAK','case' : 'CASE','char' : 'CHAR', 'const' : 'CONST','continue' : 'CONTINUE','default' : 'DEFAULT','do' : 'DO', @@ -14,80 +20,55 @@ RESERVED = {'auto' : 'AUTO','break' : 'BREAK','case' : 'CASE','char' : 'CHAR', 'double' : 'DOUBLE','else' : 'ELSE','enum' : 'ENUM','extern' : 'EXTERN', 'float' : 'FLOAT','for' : 'FOR','goto' : 'GOTO','if' : 'IF'} -PREPROCESSOR_DIRECTIVES = {'#include' : 'PREPROCESS','#define' : 'PREPROCESS','#undef' : 'PREPROCESS', - '#if' : 'PREPROCESS','#ifdef' : 'PREPROCESS','#ifndef' : 'PREPROCESS','#error' : 'PREPROCESS', - '__FILE__' : 'PREPROCESS','__LINE__' : 'PREPROCESS','__DATE__' : 'PREPROCESS', - '__TIME__' : 'PREPROCESS','__TIMESTAMP__' : 'PREPROCESS','pragma' : 'PREPROCESS', - '#' : 'PREPROCESS','##' : 'PREPROCESS'} +PREPROCESSOR_DIRECTIVES = {'#include' : 'TOKEN_PREPROCESS','#define' : 'TOKEN_PREPROCESS','#undef' : 'TOKEN_PREPROCESS', + '#if' : 'TOKEN_PREPROCESS','#ifdef' : 'TOKEN_PREPROCESS','#ifndef' : 'TOKEN_PREPROCESS','#error' : 'TOKEN_PREPROCESS', + '__FILE__' : 'TOKEN_PREPROCESS','__LINE__' : 'TOKEN_PREPROCESS','__DATE__' : 'TOKEN_PREPROCESS', + '__TIME__' : 'TOKEN_PREPROCESS','__TIMESTAMP__' : 'TOKEN_PREPROCESS','pragma' : 'TOKEN_PREPROCESS', + '#' : 'TOKEN_PREPROCESS','##' : 'TOKEN_PREPROCESS','#endif' : 'TOKEN_PREPROCESS'} -REGULAR = {'/*' : 'CSTART','*/' : 'CEND','=' : 'ASSIGN','==' : 'EQUAL', - '{' : 'LBRACE','}' : 'RBRACE','\+' : 'PLUS','-' : 'MINUS', - '\*' : 'MULT','/' : 'DIV','\(' : 'LPAREN','\)' : 'RPAREN', - ',' : 'COMMA',';' : 'SEMICOLON','\<' : 'LANGLE','\>' : 'RANGLE'} +REGULAR = {'/*' : 'TOKEN_CSTART','*/' : 'TOKEN_CEND', '*' : 'TOKEN_CMID', '=' : 'TOKEN_ASSIGN','==' : 'TOKEN_EQUAL', + '{' : 'TOKEN_LBRACE','}' : 'TOKEN_RBRACE','\+' : 'TOKEN_PLUS','-' : 'TOKEN_MINUS', + '\*' : 'TOKEN_MULT','/' : 'TOKEN_DIV','\(' : 'TOKEN_LPAREN','\)' : 'TOKEN_RPAREN', + ',' : 'TOKEN_COMMA',';' : 'TOKEN_SEMICOLON','\<' : 'TOKEN_LANGLE','\>' : 'TOKEN_RANGLE'} NASM_PREPROCESS_DIRECTIVES = {'#include' : '$include','#define' : '$define','#undef' : '$undef', - '#if' : '$if','#ifdef' : '$ifdef','#ifndef' : '$ifndef', + '#if' : '$if','#ifdef' : '$ifdef','#ifndef' : '%ifndef','#endif' : '%endif', '#error' : '$error','__FILE__' : '__FILE__','__LINE__' : '__LINE__', '__DATE__' : '__DATE__','__TIME__' : '__TIME__','__TIMESTAMP__' : '__TIMESTAMP__', 'pragma' : 'pragma','#' : '#','##' : '##'} - + +NASM_REGULAR = {'/*' : ';', '*' : ';', '*/' : '\n'} + TOKENS += RESERVED.values() -#Element type definitions. Used in the parse process. -ELEMENT_TYPE_UNKNOWN = -1 -ELEMENT_TYPE_DEFINE = 1 -ELEMENT_TYPE_INCLUDE = 2 -ELEMENT_TYPE_UNDEF = 3 -ELEMENT_TYPE_IFDEF = 4 -ELEMENT_TYPE_IFNDEF = 5 -ELEMENT_TYPE_IF = 6 -ELEMENT_TYPE_ELSE = 7 -ELEMENT_TYPE_ELIF = 8 -ELEMENT_TYPE_ENDIF = 9 -ELEMENT_TYPE_ERROR = 10 -ELEMENT_TYPE_PRAGMA = 11 -ELEMENT_TYPE_COMMENT_START = 20 -ELEMENT_TYPE_COMMENT_MULTILINE = 21 -ELEMENT_TYPE_COMMENT_END = 22 - -#Keyword : Element type dictionary, for read C-header line. -HDR_KEYWORDS = {'/*': ELEMENT_TYPE_COMMENT_START, - '*': ELEMENT_TYPE_COMMENT_MULTILINE, - '*/': ELEMENT_TYPE_COMMENT_END, - '#define': ELEMENT_TYPE_DEFINE, - '#include': ELEMENT_TYPE_INCLUDE, - '#undef': ELEMENT_TYPE_UNDEF, - '#ifdef': ELEMENT_TYPE_IFDEF, - '#ifndef': ELEMENT_TYPE_IFNDEF, - '#if': ELEMENT_TYPE_IF, - '#else': ELEMENT_TYPE_ELSE, - '#elif': ELEMENT_TYPE_ELIF, - '#endif': ELEMENT_TYPE_ENDIF, - '#error': ELEMENT_TYPE_ERROR, - '#pragma': ELEMENT_TYPE_PRAGMA} - -#Element type : keyword, for assembly include output file. -INC_KEYWORDS = {ELEMENT_TYPE_COMMENT_START: ';', - ELEMENT_TYPE_COMMENT_MULTILINE: ';', - ELEMENT_TYPE_COMMENT_END: '', - ELEMENT_TYPE_DEFINE: '%define', - ELEMENT_TYPE_INCLUDE: '%include', - ELEMENT_TYPE_UNDEF: '%undef', - ELEMENT_TYPE_IFDEF: '%ifdef', - ELEMENT_TYPE_IFNDEF: '%ifndef', - ELEMENT_TYPE_IF: '%if', - ELEMENT_TYPE_ELSE: '%else', - ELEMENT_TYPE_ELIF: '%elif', - ELEMENT_TYPE_ENDIF: '%endif', - ELEMENT_TYPE_ERROR: '%error', - ELEMENT_TYPE_PRAGMA: '%pragma'} +COMMENT_SINGLE_LINE = 0 +COMMENT_MULTI_LINE = 1 class PARSEOBJECT: + _passes = count(0) + def __init__(self): - self.tupline = [] - self.tupfile = [] + self.parseline = [] + self.parsefile = [] self.passes = 0 + def parse_reset(self): + self.parseline = [] + self.parsefile = [] + self._passes = count(0) + + def inc_passes(self): + self.passes = next(self._passes) + + def parseheader(self, fl): + self.parse_reset() + for l in fl: + analyzed_line = self.analyzer(l) + self.parsefile.append(analyzed_line) + self.inc_passes() + self.parsetokens(self.parsefile) + return self.parsefile + def parseinclude(self, data): tempstr = str(data) if tempstr.startswith('<'): @@ -98,39 +79,70 @@ class PARSEOBJECT: tempstr = tempstr.replace('.h', '.inc"') return tempstr - def lexer_get_token(k): - prep = keywords.preprocessor_directives - reg = keywords.regular + def tokenizer(self, w): token = "" - - if w in prep: - token = prep(w) - if w in reg: - token = reg(w) - - return token + if w in PREPROCESSOR_DIRECTIVES: + token = PREPROCESSOR_DIRECTIVES.get(w) + return token + if w in REGULAR: + token = REGULAR.get(w) + return token + return False def analyzer(self, ln): + analysed = [] word = [w for w in ln.split()] for w in word: - if w in HDR_KEYWORDS: - v = HDR_KEYWORDS[w] - self.tupline.append(v) + t = self.tokenizer(w) + if t == False: + analysed.append(w) + continue else: - self.tupline.append(w) - return self.tupline + analysed.append(t) + analysed.append(w) + return analysed - for l in self.tupfile: + def parsetokens(self, fl): + templine = [] + tempfile = [] + for l in fl: if len(l) == 0: continue - if l[0] == ELEMENT_TYPE_INCLUDE: - templine.append('%include'+' '+str(self.parseinclude(l[1]))) + if l[0] == "TOKEN_CSTART": + if l[-1] == "TOKEN_CEND": + tempfile.append(self.parse_comment(l,COMMENT_SINGLE_LINE)) + else: + tempfile.append(self.parse_comment(l,COMMENT_MULTI_LINE)) + if l[0] == "TOKEN_PREPROCESS": + tempfile.append(self.parse_preprocess(l)) + + def parse_comment(self, l, ct): + + + def parse_preprocess(self, l): + newline = [] + for e in l: + if e in TOKENS: + continue + for k, v in PREPROCESSOR_DIRECTIVES.items(): + if k == e: + newline.append(NASM_PREPROCESS_DIRECTIVES.get(k)) + break + break + if k == "#include": + newline.append(self.parseinclude(e) + else: + newline.append(e) + newline.append('\n') + return newline + class PARSER(PARSEOBJECT): _ids = count(0) + _passes = count(0) def __init__(self): self.id = next(self._ids) self.tupline = [] self.tupfile = [] - self.passes = 0 \ No newline at end of file + self.passes = next(self._passes) \ No newline at end of file