#!/apps/python/3.5-anaconda/bin/python

import sys
import os
import re
import argparse

from testcase import Testcase
from param import Register, MemAddr, Parameter


class Instr_extractor(object):
    """Count the instruction forms found in marked regions of objdump files.

    A region starts at a source line containing MARKER.  Every assembly line
    inside the region is reduced to an instruction form (mnemonic plus operand
    kinds) and counted in ``db``.  The first time an instruction form shows
    up, a benchmark test case is written for it via ``Testcase``.
    """

    # Constant variables
    MARKER = r'//STARTLOOP'
    ASM_LINE = re.compile(r'\s[0-9a-f]+[:]')
    # A "mnemonic" that is exactly two hex digits is a raw byte, not an instruction
    _EMPTY_BYTE = re.compile(r'[0-9a-f]{2}')
    # File used by save_db()/load_db(), relative to the current directory
    _DB_FILE = '.cnt_asm_ops.db'

    def __init__(self, filepath):
        """Create an extractor.

        filepath -- sequence of paths to the files that check_all() analyses
        """
        self.filepaths = filepath
        # Instruction form -> number of occurrences.  Created per instance so
        # that two extractors never write into one shared dictionary.
        self.db = {}
        self.sorted_db = []
        # 1-based number of the line currently being checked
        self.lncnt = 1
        # Path of the file currently being analysed
        self.fname = ''
        self._reset_line_state()

    def _reset_line_state(self):
        """Reset the per-file state used by check_line()."""
        # Indentation (in units of cntChar) of the last marker line
        self.numSeps = 0
        # Set to 2 on a marker line; decremented on every non-assembly line
        # that is indented no deeper than the marker.  While > 0 we analyse.
        self.sem = 0
        # Indentation character of the high level code (' ' or '\t')
        self.cntChar = ''
        # True until the indentation character has been detected
        self.first = True

    def check_all(self):
        """Analyse every file in ``filepaths``."""
        for path in self.filepaths:
            self.extract_instr(path)

    def is_elffile(self, filepath):
        """Return True if filepath is a file containing the text 'format elf64'."""
        if not os.path.isfile(filepath):
            return False
        with open(filepath) as f:
            # The needle contains no newline, so a line-wise search is
            # equivalent to searching the whole content at once.
            return any('format elf64' in line for line in f)

    def extract_instr(self, asmFile):
        """Analyse one file line by line and count its instruction forms.

        Prints a message and returns without analysing anything if asmFile is
        rejected by is_elffile() or cannot be opened.
        """
        self.fname = asmFile
        # Check if parameter is in the correct file format
        if not self.is_elffile(asmFile):
            print('Invalid argument')
            return
        # Open file
        try:
            f = open(asmFile, 'r')
        except IOError:
            print('IOError: File not found')
            # Nothing to analyse; do not fall through with an unbound handle
            return
        # Marker/indentation state of a previous file must not leak into this one
        self._reset_line_state()
        # Analyse code line by line and check the instructions
        self.lncnt = 1
        with f:
            for line in f:
                self.check_line(line)
                self.lncnt += 1

    def check_line(self, line):
        """Track the marked region and forward its assembly lines to check_instr()."""
        if self.MARKER in line:
            # But first, check if the high level code is indented with spaces or tabs
            if self.first:
                self.set_counter_char(line)
                self.first = False
            # Remember how deep the marker itself is indented
            self.numSeps = line.split(self.MARKER)[0].count(self.cntChar)
            self.sem = 2
        elif self.sem > 0:
            # We're in the marked code snippet
            if self.ASM_LINE.search(line):
                # Assembly line; skip it if it carries a comment
                if '//' in line:
                    return
                # The instruction is the last tab separated field of the line
                self.check_instr(line.split('\t')[-1])
            elif re.split(r'\S', line)[0].count(self.cntChar) <= self.numSeps:
                # Not in the loop anymore - or yet - so we decrement the semaphore
                self.sem -= 1

    def set_counter_char(self, line):
        """Detect whether the text before the marker is indented with spaces or tabs.

        Raises NotImplementedError if that text contains both kinds of
        character or neither of them.
        """
        indent = line.split(self.MARKER)[0]
        numSpaces = indent.count(' ')
        numTabs = indent.count('\t')
        if numSpaces != 0 and numTabs == 0:
            self.cntChar = ' '
        elif numSpaces == 0 and numTabs != 0:
            self.cntChar = '\t'
        else:
            err_msg = 'Indentation of code is only supported for whitespaces and tabs.'
            raise NotImplementedError(err_msg)

    def _classify_operand(self, op):
        """Turn one operand string into a Parameter, Register or MemAddr object."""
        if len(op) <= 0:
            return Parameter('NONE')
        if op[0] == '$':
            return Parameter('IMD')
        if op[0] == '%' and '(' not in op:
            # Register, optionally followed by an opmask in curly braces
            opmask = '{' in op
            end = op.index('{') if opmask else len(op)
            return Register(op[1:end], opmask)
        if '<' in op:
            return Parameter('LBL')
        return MemAddr(op)

    def check_instr(self, instr):
        """Count the instruction form of instr and create its test case if new.

        instr -- instruction text (mnemonic followed by its operands)
        """
        # Check for strange clang padding bytes
        while instr.startswith('data32'):
            instr = instr[7:]
        # Separate mnemonic and operands
        tokens = instr.split()
        if not tokens:
            # Empty instruction field: nothing to count
            return
        mnemonic = tokens[0]
        params = ''.join(tokens[1:])
        # Check if line is not only a byte
        if self._EMPTY_BYTE.fullmatch(mnemonic):
            return
        # Check if there's one or more operand and store all in a list
        param_list = self.flatten(self.separate_params(params))
        # Classify operands as IMMEDIATE (IMD), REGISTER (REG), MEMORY (MEM) or LABEL (LBL)
        opList = []
        for i, raw_op in enumerate(param_list):
            op = self._classify_operand(raw_op)
            opList.append(op)
            if isinstance(op, Register):
                # Registers additionally carry their size in the instruction form
                param_list[i] = str(op) + str(op.size)
            else:
                param_list[i] = str(op)
        # Join mnemonic and operand(s) to an instruction form
        tabs = '\t' if len(mnemonic) > 7 else '\t\t'
        instr_form = mnemonic + tabs + ' '.join(param_list)
        # Check in database for instruction form and increment the counter
        if instr_form in self.db:
            self.db[instr_form] += 1
            return
        self.db[instr_form] = 1
        # First appearance of this instruction form: create a test case for it,
        # but only if no label (LBL) or empty operand is part of the operands
        if any(str(par) in ('LBL', '') for par in opList):
            return
        # Create testcase with reversed param list, due to the fact its intel syntax!
        tc = Testcase(mnemonic, list(reversed(opList)), '64')
        tc.write_testcase()

    def separate_params(self, params):
        """Split an operand string at its top-level commas.

        Returns a (possibly nested) list; use flatten() to get a flat one.
        Text following a '#' is dropped when params contains no comma.
        """
        if ',' in params:
            if ')' in params:
                close = params.index(')')
                if close < len(params) - 1 and params[close + 1] == ',':
                    # Split right behind the memory operand, not inside of it
                    i = close + 1
                elif params.index('(') < params.index(','):
                    # First comma is inside the parentheses: one single operand
                    return [params]
                else:
                    i = params.index(',')
            else:
                i = params.index(',')
            return [params[:i], self.separate_params(params[i + 1:])]
        if '#' in params:
            return [params[:params.index('#')]]
        return [params]

    def sort_db(self):
        """Store the database entries in sorted_db, most frequent first."""
        self.sorted_db = sorted(self.db.items(), key=lambda x: x[1], reverse=True)

    def print_sorted_db(self):
        """Print all instruction forms with their counts, most frequent first."""
        self.sort_db()
        total = 0
        print('Number of\tmnemonic')
        print('calls\n')
        for instr_form, count in self.sorted_db:
            print(str(count) + '\t\t' + instr_form)
            total += count
        print('\nCumulated number of instructions: ' + str(total))

    def save_db(self):
        """Write the database to '.cnt_asm_ops.db' in the current directory."""
        with open(self._DB_FILE, 'w') as db_file:
            for instr_form, count in self.db.items():
                db_file.write(instr_form + '\t' + str(count) + '\n')

    def load_db(self):
        """Read '.cnt_asm_ops.db' from the current directory into the database.

        Prints a message and leaves the database untouched if the file does
        not exist.
        """
        try:
            db_file = open(self._DB_FILE, 'r')
        except FileNotFoundError:
            print('no database found in current directory')
            return
        with db_file:
            for line in db_file:
                # save_db() writes '<instruction form>\t<count>\n'; the form
                # itself contains tabs, so split at the last one only
                instr_form, sep, numCalls = line.rstrip('\n').rpartition('\t')
                if not sep:
                    # Blank or foreign line without a count field
                    continue
                self.db[instr_form] = int(numCalls)

    def flatten(self, l):
        """Return a flat list with the elements of the arbitrarily nested list l."""
        flat = []
        for item in l:
            if isinstance(item, list):
                flat.extend(self.flatten(item))
            else:
                flat.append(item)
        return flat


def main():
    """Parse the command line, analyse the given files and print the result."""
    # Parse args
    parser = argparse.ArgumentParser(description='Returns a list of all instruction forms in the '
                                     'given files sorted by their number of occurences.')
    parser.add_argument('-V', '--version', action='version', version='%(prog)s 0.2')
    parser.add_argument('filepath', nargs='+', help='path to objdump(s)')
    parser.add_argument('-l', '--load', dest='load', action='store_true', help='load database'
                        ' before checking new files')
    parser.add_argument('-s', '--store', dest='store', action='store_true', help='store database '
                        'after checking new files')

    # Create object and store arguments as attribute
    inp = parser.parse_args()
    ie = Instr_extractor(inp.filepath)

    # Do work
    if inp.load:
        ie.load_db()
    ie.check_all()
    ie.print_sorted_db()
    if inp.store:
        ie.save_db()


## ---------main method----------
if __name__ == '__main__':
    main()