def _docOperandMatchesXml(docOp, xmlOp):
    """Return True if a spreadsheet operand description fits an XML operand.

    docOp is one non-empty operand cell from AMD's spreadsheet (e.g. 'reg',
    'mem', 'xmm2/mem'); xmlOp is an <operand> element of the instruction XML.
    """
    if docOp in ['pntr16/mem16:16/32']:
        return True

    opType = xmlOp.attrib['type']
    if opType == 'reg':
        isImplicit = xmlOp.attrib.get('implicit', '') == '1'
        if docOp == 'segmentReg':
            # segment registers are only accepted as implicit operands
            return isImplicit
        if docOp in ['reg', 'reg/mem'] and not isImplicit:
            return True
        if 'MM' not in xmlOp.text:
            if docOp == 'Sti' and xmlOp.text.startswith('ST'):
                return True
            if docOp == 'ax' and xmlOp.text == 'AX':
                return True
            return ('width' in xmlOp.attrib
                    and re.search(r'reg(\d+/)*' + xmlOp.attrib['width'], docOp) is not None)
        return (('mmx' in docOp and xmlOp.text.startswith('MM'))
                or ('xmm' in docOp and xmlOp.text.startswith('XMM'))
                or ('ymm' in docOp and xmlOp.text.startswith('YMM')))

    if opType == 'mem':
        if docOp in ['mem', 'reg/mem', 'xmm2/mem', 'vm32x']:
            return True
        return re.search(r'mem(\d+/)*' + xmlOp.attrib['width'], docOp) is not None

    if opType in ['imm', 'relbr']:
        if docOp in ['imm', 'imm`', 'CL/Imm', 'xmm3/imm']:
            return True
        return re.search(r'imm(\d+/)*' + xmlOp.attrib['width'], docOp) is not None

    return False


def main():
    """Annotate the instruction XML with data from AMD's documentation spreadsheet.

    Command-line options: -xml (input instruction XML), -xlsx (AMD spreadsheet),
    -outputXML (file to write) and -arch (name of the <architecture> node that
    receives the data). For every instruction variant that can be matched to a
    spreadsheet row, a <doc> child with the uops/ports/latency/TP attributes is
    appended to that architecture node.
    """
    parser = argparse.ArgumentParser(description="Add data to XML file from AMD's doc")
    parser.add_argument('-xml')
    parser.add_argument('-xlsx')
    parser.add_argument('-outputXML')
    parser.add_argument('-arch')
    args = parser.parse_args()

    docEntrySet = set()
    mnemonicMap = dict()  # mnemonic -> list of all DocEntries with that mnemonic

    wb = load_workbook(args.xlsx, data_only=True)
    ws = wb.active
    for row in ws.iter_rows(min_row=2, values_only=True):
        mnemonicStr = row[0]
        if not mnemonicStr:
            continue

        if mnemonicStr.endswith('cc'):
            # expand the condition-code placeholder into the concrete mnemonics
            mnemonics = [mnemonicStr.replace('cc', p) for p in ['B', 'BE', 'L', 'LE', 'NB', 'NBE', 'NL', 'NLE', 'NO', 'NP', 'NS', 'NZ', 'O', 'P', 'S', 'Z']]
        else:
            mnemonics = mnemonicStr.replace(' (near)', '').replace('cc', '').split('/')

        for mnemonic in mnemonics:
            if mnemonic in ['AAA', 'AAD', 'AAM', 'AAS', 'ARPL', 'BOUND', 'DAA', 'DAS', 'INTO', 'JCXZ', 'LDS', 'LES', 'POPA', 'POPAD', 'POPD', 'POPFD', 'PUSHA', 'PUSHAD', 'PUSHFD']:
                # 32-bit instructions
                continue
            if mnemonic in ['CMPS', 'FCLEX', 'FINIT', 'FSAVE', 'FSTCW', 'FSTENV', 'FSTSW', 'INS', 'LODS', 'LOOPNZ', 'LOOPZ', 'MOVS', 'OUTS', 'PCLMULHQHQDQ', 'PCLMULHQLQDQ', 'PCLMULLQHQDQ', 'PCLMULLQLQDQ', 'RDPRU', 'SAL', 'SCAS', 'STOS', 'VGATHERDD', 'VGATHERDQ', 'VGATHERQD', 'VGATHERQQ', 'VPCLMULHQHQDQ', 'VPCLMULHQLQDQ', 'VPCLMULLQHQDQ', 'VPCLMULLQLQDQ', 'WAIT', 'XLATB']:
                # missing in XED
                continue
            if mnemonic in ['INT1', 'JECXZ']:
                # missing from XML file
                continue

            operands = row[1:5]

            ops = row[7]
            if ops == 'not supported':
                continue

            unit = row[8]
            lat = row[9]
            tp = row[10]

            de = DocEntry(mnemonic, operands, ops, unit, lat, tp)
            docEntrySet.add(de)
            mnemonicMap.setdefault(mnemonic, []).append(de)

    # maps both the iclass and the asm mnemonic (without a {...} prefix) to the XML instruction nodes
    iclassAsmDict = dict()

    root = ET.parse(args.xml).getroot()
    for instrNode in root.iter('instruction'):
        if instrNode.attrib.get('evex', '') == '1':
            continue
        if instrNode.attrib['extension'] == 'VAES':
            continue
        iclass = instrNode.attrib['iclass']
        asm = instrNode.attrib['asm']
        iclassAsmDict.setdefault(iclass, set()).add(instrNode)
        iclassAsmDict.setdefault(re.sub('{.*} ', '', asm), set()).add(instrNode)

    xmlToDocDict = dict()

    for de in sorted(docEntrySet):
        if de.mnemonic not in iclassAsmDict:
            print('no XML entry found for ' + str(de))
            # without this, the lookup below raises a KeyError for the unknown mnemonic
            continue

        docOperands = [op for op in de.operands if op is not None]

        xmlFound = False
        for instrNode in iclassAsmDict[de.mnemonic]:
            explXmlOperands = [op for op in instrNode.findall('./operand')
                               if not op.attrib.get('suppressed', '') == '1' and not op.attrib.get('implicit', '') == '1']

            # an entry without operands is not assigned to a memory variant if a variant without memory operands exists
            if (not docOperands and any(op.attrib['type'] == 'mem' for op in explXmlOperands) and
                    any(len(instrNode2.findall('./operand[@type="mem"]')) == 0 for instrNode2 in iclassAsmDict[de.mnemonic] if instrNode != instrNode2)):
                continue

            # operand-count mismatch: skip if another entry for this mnemonic has the right number of operands
            if docOperands and explXmlOperands and (len(explXmlOperands) != len(docOperands)):
                if any(len(explXmlOperands) == len([op for op in de2.operands if op is not None]) for de2 in mnemonicMap[de.mnemonic] if de != de2):
                    continue

            if docOperands and explXmlOperands:
                xmlOperands = explXmlOperands
            else:
                xmlOperands = [op for op in instrNode.findall('./operand')]

            if de.mnemonic not in ['CLZERO']:
                if not all(_docOperandMatchesXml(docOp, xmlOp) for docOp, xmlOp in zip(docOperands, xmlOperands)):
                    continue

            if instrNode in xmlToDocDict:
                if (set(de.operands) != {None}) and (set(xmlToDocDict[instrNode].operands) == {None}):
                    # an entry with explicit operands replaces an earlier one without operands
                    xmlFound = True
                    xmlToDocDict[instrNode] = de
                elif (set(de.operands) == {None}) and (set(xmlToDocDict[instrNode].operands) != {None}):
                    pass
                else:
                    print('duplicate entry for ' + instrNode.attrib['string'] + ' found: ' + str(list(xmlToDocDict[instrNode])) + ', ' + str(list(de)))
            else:
                xmlFound = True
                xmlToDocDict[instrNode] = de

        if not xmlFound:
            print('no matching XML entry found for ' + str(de))

    print('Found data for ' + str(len(xmlToDocDict)) + ' instruction variants')

    for instrNode, de in xmlToDocDict.items():
        archNode = instrNode.find('./architecture[@name="{}"]'.format(args.arch))
        if archNode is None:
            archNode = ET.SubElement(instrNode, "architecture")
            archNode.attrib['name'] = args.arch

        docNode = ET.SubElement(archNode, "doc")
        if de.ops: docNode.attrib['uops'] = str(de.ops)
        if de.unit: docNode.attrib['ports'] = str(de.unit)
        if de.lat and de.lat != '-': docNode.attrib['latency'] = str(de.lat)
        if de.tp:
            try:
                if str(de.tp) == '0.33':
                    # avoid the rounding error of 1/0.33
                    docNode.attrib['TP'] = '3.00'
                else:
                    # the XML TP attribute is the reciprocal of the spreadsheet value
                    docNode.attrib['TP'] = format(1/float(de.tp), '.2f')
            except ValueError:
                # non-numeric spreadsheet value: copy it verbatim
                docNode.attrib['TP'] = de.tp

    with open(args.outputXML, "w") as f:
        rough_string = ET.tostring(root, 'utf-8')
        reparsed = minidom.parseString(rough_string)
        f.write('\n'.join([line for line in reparsed.toprettyxml(indent=' '*2).split('\n') if line.strip()]))
(instrNode.attrib['mask'] == '1' and de.mask == 'no') or (instrNode.attrib['mask'] == '0' and de.mask == 'yes'): + matchingDEs.remove(de) + + if len(matchingDEs) == 0: + print 'No matching iform: ' + iform + elif len(matchingDEs) > 1: + print 'Multiple matching iforms: ' + iform + else: + de = next(iter(matchingDEs)) + + archNode = instrNode.find('./architecture[@name="{}"]'.format(args.arch)) + if archNode is None: + archNode = ET.SubElement(instrNode, "architecture") + archNode.attrib['name'] = args.arch + + docNode = ET.SubElement(archNode, "doc") + if de.tp: docNode.attrib['TP'] = de.tp + if de.lat: docNode.attrib['latency'] = de.lat + + with open(args.outputXML, "w") as f: + rough_string = ET.tostring(root, 'utf-8') + reparsed = minidom.parseString(rough_string) + f.write('\n'.join([line for line in reparsed.toprettyxml(indent=' '*2).split('\n') if line.strip()])) + + +if __name__ == "__main__": + main() diff --git a/tools/cpuBench/compareMeasurementsToOther.py b/tools/cpuBench/compareMeasurementsToOther.py new file mode 100755 index 0000000..9175013 --- /dev/null +++ b/tools/cpuBench/compareMeasurementsToOther.py @@ -0,0 +1,211 @@ +#!/usr/bin/python +import xml.etree.ElementTree as ET +import argparse +import sys +from utils import * + +def main(): + parser = argparse.ArgumentParser(description='Compare results') + parser.add_argument("-input", help="Input XML file", default='result.xml') + parser.add_argument("-arch", help="Consider only this architecture") + parser.add_argument("-ignoreLockRep", help="Ignore Instructions with lock and rep prefixes", action='store_true') + parser.add_argument("-verbose", help="Verbose mode", action='store_true') + args = parser.parse_args() + + root = ET.parse(args.input) + + instrArchNodes = [] + for instrNode in root.iter('instruction'): + if args.ignoreLockRep and ('LOCK_' in instrNode.attrib['iform'] or 'REP_' in instrNode.attrib['iform']): continue + archNode = 
instrNode.find('./architecture[@name="{}"]'.format(args.arch)) + if archNode is not None: + instrArchNodes.append((instrNode, archNode)) + + nPortsMeasurementOnly = 0 + nPortsOtherOnly = 0 + nPortsBoth = 0 + nPortsEq = 0 + nPortsDiff = 0 + + nUopsMeasurementOnly = 0 + nUopsOtherOnly = 0 + nUopsBoth = 0 + nUopsEq = 0 + nUopsEqPortsEq = 0 + nUopsEqPortsDiff = 0 + nUopsDiff = 0 + + nLatMeasurementOnly = 0 + nLatOtherOnly = 0 + nLatBoth = 0 + nLatUB = 0 + nLatUBCorrect = 0 + nLatUBExact = 0 + nLatUBClose = 0 + nLatUBIncorrect = 0 + nLatNoUB = 0 + nLatNoUBMaxEq = 0 + nLatNoUBMaxDiff = 0 + + for instrNode, archNode in instrArchNodes: + measurementNode = archNode.find('measurement') + nonMeasurementNodes = archNode.findall('./IACA') + archNode.findall('doc') + + otherPorts = [v for m in nonMeasurementNodes for a,v in m.attrib.items() if a.startswith('ports')] + mPorts = ([v for a, v in measurementNode.attrib.items() if a.startswith('ports')] if measurementNode is not None else []) + + portsEq = False + portsDiff = False + + if mPorts: + if otherPorts: + nPortsBoth += 1 + if any(m in otherPorts for m in mPorts): + portsEq = True + nPortsEq += 1 + else: + portsDiff = True + nPortsDiff += 1 + if args.verbose: print 'PortsDiff: {} - {} - {}'.format(instrNode.attrib['string'], mPorts, otherPorts) + else: + nPortsMeasurementOnly += 1 + else: + if otherPorts: + nPortsOtherOnly += 1 + if args.verbose: print 'PortsOtherOnly: ' + instrNode.attrib['string'] + + otherUops = [v for m in nonMeasurementNodes for a,v in m.attrib.items() if a.startswith('uops') and v.replace('.','',1).isdigit()] + mUops = ([v for a,v in measurementNode.attrib.items() if a.startswith('uops') and not 'retire_slots' in a] if measurementNode is not None else []) + + if mUops: + if otherUops: + nUopsBoth += 1 + if any(m in otherUops for m in mUops): + nUopsEq += 1 + nUopsEqPortsEq += int(portsEq) + nUopsEqPortsDiff += int(portsDiff) + else: + nUopsDiff += 1 + if args.verbose: print 'UopsDiff: {} - {} - 
{}'.format(instrNode.attrib['string'], mUops, otherUops) + else: + nUopsMeasurementOnly += 1 + else: + if otherUops: + nUopsOtherOnly += 1 + if args.verbose: print 'UopsOtherOnly: ' + instrNode.attrib['string'] + + + otherLatencies = [float(v) for m in nonMeasurementNodes for a,v in m.attrib.items() if a.startswith('latency') and v.replace('.','',1).isdigit()] + + latEntry = getLatencyTableEntry(measurementNode) + if latEntry is not None: + if otherLatencies: + nLatBoth += 1 + _, _, _, maxLat, maxLatUB = latEntry + if maxLatUB: + nLatUB += 1 + if any(x for x in otherLatencies if x <= maxLat): + nLatUBCorrect += 1 + if maxLat in otherLatencies: + nLatUBExact += 1 + diff = min(abs(float(maxLat)-float(o)) for o in otherLatencies) + if diff <= 1.01: + nLatUBClose += 1 + else: + nLatUBIncorrect += 1 + if args.verbose: print 'LatUBIncorrect: {} - {} - {}'.format(instrNode.attrib['string'], maxLat, otherLatencies) + else: + nLatNoUB += 1 + if maxLat in otherLatencies: + nLatNoUBMaxEq += 1 + else: + nLatNoUBMaxDiff += 1 + if args.verbose: print 'LatNoUBMaxDiff: {} - {} - {}'.format(instrNode.attrib['string'], maxLat, otherLatencies) + else: + nLatMeasurementOnly += 1 + else: + if otherLatencies: + nLatOtherOnly += 1 + if args.verbose: print 'LatOtherOnly: ' + instrNode.attrib['string'] + + print 'Ports:' + print ' Measurement data only: ' + str(nPortsMeasurementOnly) + print ' Other data only: ' + str(nPortsOtherOnly) + print ' Both: ' + str(nPortsBoth) + print ' Eq: ' + str(nPortsEq) + print ' Diff: ' + str(nPortsDiff) + print '' + + print 'Uops:' + print ' Measurement data only: ' + str(nUopsMeasurementOnly) + print ' Other data only: ' + str(nUopsOtherOnly) + print ' Both: ' + str(nUopsBoth) + print ' Eq: ' + str(nUopsEq) + print ' PortsEq: ' + str(nUopsEqPortsEq) + print ' PortsDiff: ' + str(nUopsEqPortsDiff) + print ' Diff: ' + str(nUopsDiff) + print '' + + print 'Latency:' + print ' Measurement data only: ' + str(nLatMeasurementOnly) + print ' Other data only: ' + 
str(nLatOtherOnly) + print ' Both: ' + str(nLatBoth) + print ' Exact: ' + str(nLatNoUB) + print ' Eq (Max): ' + str(nLatNoUBMaxEq) + print ' Diff (Max): ' + str(nLatNoUBMaxDiff) + print ' Upper Bound: ' + str(nLatUB) + print ' Correct: ' + str(nLatUBCorrect) + print ' Exact: ' + str(nLatUBExact) + print ' Close: ' + str(nLatUBClose) + print ' Incorrect: ' + str(nLatUBIncorrect) + print '' + + print 'Throughput:' + for TP_m, TP_o in [('TP', 'TP'), ('TP_ports', 'TP'), ('TP', 'TP_ports'), ('TP_ports', 'TP_ports')]: + nTPMeasurementOnly = 0 + nTPOtherOnly = 0 + nTPBoth = 0 + nTPEq = 0 + nTPDiff = 0 + nTPClose = 0 + nTPNotClose = 0 + + for instrNode, archNode in instrArchNodes: + measurementNode = archNode.find('measurement') + nonMeasurementNodes = archNode.findall('./IACA') + archNode.findall('doc') + + otherTPs = [float(v) for m in nonMeasurementNodes for a,v in m.attrib.items() if a in [TP_o, TP_o+'_same_reg'] and v.replace('.','',1).isdigit()] + mTPs = ([float(v) for a, v in measurementNode.attrib.items() if a in [TP_m, TP_m+'_same_reg']] if measurementNode is not None else []) + + if mTPs: + if otherTPs: + nTPBoth += 1 + if any(m in otherTPs for m in mTPs): + nTPEq += 1 + else: + nTPDiff += 1 + if args.verbose: print 'TPDiff ({} (measurements) - {} (other)): {} - {} - {}'.format(TP_m, TP_o, instrNode.attrib['string'], mTPs, otherTPs) + diff = min(abs(float(m)-float(o)) for o in otherTPs for m in mTPs) + if diff <= .1: + nTPClose += 1 + else: + nTPNotClose += 1 + if args.verbose: print 'TPNotClose ({} (measurements) - {} (other)): {} - {} - {}'.format(TP_m, TP_o, instrNode.attrib['string'], mTPs, otherTPs) + else: + nTPMeasurementOnly += 1 + else: + if otherTPs: + nTPOtherOnly += 1 + if args.verbose: print 'TPOtherOnly ({} (measurements) - {} (other)): {}'.format(TP_m, TP_o, instrNode.attrib['string']) + + print ' {} (measurements) - {} (other):'.format(TP_m, TP_o) + print ' Measurement data only: ' + str(nTPMeasurementOnly) + print ' Other data only: ' + 
# Shows the differences between two XML files for a specific microarchitecture
def main():
    """Print the differences between the measurement nodes of two XML files.

    Positional arguments: inp1 arch1 inp2 arch2. Instructions are matched by
    their 'string' attribute. With -TP, -lat and/or -ports only the selected
    aspects are compared and a difference count is printed for each; without
    any of these flags the serialized <measurement> nodes are compared as a whole.
    """
    parser = argparse.ArgumentParser(description='Compare XML files')
    parser.add_argument('inp1')
    parser.add_argument('arch1')
    parser.add_argument('inp2')
    parser.add_argument('arch2')
    parser.add_argument('-TP', action='store_true')
    parser.add_argument('-lat', action='store_true')
    parser.add_argument('-ports', action='store_true')
    args = parser.parse_args()

    root1 = ET.parse(args.inp1).getroot()
    root2 = ET.parse(args.inp2).getroot()

    instrNodeDict1 = {instrNode.attrib['string']: instrNode for instrNode in root1.iter('instruction')}
    instrNodeDict2 = {instrNode.attrib['string']: instrNode for instrNode in root2.iter('instruction')}

    # loop-invariant XPath expressions
    measurementPath1 = './architecture[@name="' + args.arch1 + '"]/measurement'
    measurementPath2 = './architecture[@name="' + args.arch2 + '"]/measurement'

    tpDiff = 0
    latDiff = 0
    portsDiff = 0

    for instrStr in sorted(instrNodeDict1):
        instrNode1 = instrNodeDict1[instrStr]
        if instrStr not in instrNodeDict2:
            print('No matching entry found for ' + instrStr)
            continue
        instrNode2 = instrNodeDict2[instrStr]
        for mNode1 in instrNode1.findall(measurementPath1):
            for mNode2 in instrNode2.findall(measurementPath2):
                if args.TP:
                    # .get() (as for the ports below) so that a measurement node without
                    # a TP attribute is reported as a difference instead of raising a KeyError
                    tp1 = mNode1.attrib.get('TP', '')
                    tp2 = mNode2.attrib.get('TP', '')
                    if tp1 != tp2:
                        tpDiff += 1
                        print(instrStr + ' - TP1: ' + tp1 + ' - TP2: ' + tp2)

                if args.lat:
                    # latency nodes are compared pairwise in document order
                    for latNode1, latNode2 in zip(mNode1.findall('./latency'), mNode2.findall('./latency')):
                        latStr1 = ET.tostring(latNode1, encoding='utf-8').strip()
                        latStr2 = ET.tostring(latNode2, encoding='utf-8').strip()
                        if latStr1 != latStr2:
                            latDiff += 1
                            print(instrStr)
                            print(' ' + latStr1)
                            print(' ' + latStr2)

                if args.ports:
                    p1 = mNode1.attrib.get('ports', '')
                    p2 = mNode2.attrib.get('ports', '')
                    if p1 != p2:
                        portsDiff += 1
                        print(instrStr + ' - P1: ' + p1 + ' - P2: ' + p2)

                if not args.TP and not args.lat and not args.ports:
                    xmlStr1 = ET.tostring(mNode1, encoding='utf-8').strip()
                    xmlStr2 = ET.tostring(mNode2, encoding='utf-8').strip()

                    if xmlStr1 != xmlStr2:
                        print('-------------------------------')
                        print(instrStr)
                        print(xmlStr1)
                        print(xmlStr2)
                        print('-------------------------------')

    if args.TP:
        print('TPDiff: ' + str(tpDiff))

    if args.lat:
        print('LatDiff: ' + str(latDiff))

    if args.ports:
        print('portsDiff: ' + str(portsDiff))
'R14W', 'R14B', 'R15', 'R15D', 'R15W', 'R15B', 'SP', 'SPL', 'ESP', 'RSP', 'XMM13', 'YMM13', 'ZMM13', 'XMM14', 'YMM14', 'ZMM14', 'XMM15', 'YMM15', 'ZMM15', 'MM15', 'IP', 'DR4', 'DR5', 'DR6', 'DR7', 'RBP', 'EBP', 'BP', 'K0'} #ToDo +#R14: reserved for memory addresses +#R13: can be written in init; will not be overwritten by other code +#R15: loop counter + +specialRegs = {'ES', 'CS', 'SS', 'DS', 'FS', 'GS', 'IP', 'EIP', 'FSBASEy', 'GDTR', 'GSBASEy', 'IDTR', 'IP', 'LDTR', 'MSRS', 'MXCSR', 'RFLAGS', 'RIP', + 'TR', 'TSC', 'TSCAUX', 'X87CONTROL', 'X87POP', 'X87POP2', 'X87PUSH', 'X87STATUS', 'X87TAG', 'XCR0', 'XMM0dq', 'CR0', 'CR2', 'CR3', 'CR4', 'CR8', 'ERROR', + 'BND0', 'BND1', 'BND2', 'BND3'} + +GPRRegs = {'AH', 'AL', 'AX', 'BH', 'BL', 'BP', 'BPL', 'BX', 'CH', 'CL', 'CX', 'DH', 'DI', 'DIL', 'DL', 'DX', 'EAX', + 'EBP', 'EBX', 'ECX', 'EDI', 'EDX', 'ESI', 'ESP', 'R10', 'R10B', 'R10D', 'R10W', 'R11', 'R11B', 'R11D', 'R11W', 'R12', + 'R12B', 'R12D', 'R12W', 'R13', 'R13B', 'R13D', 'R13W', 'R14', 'R14B', 'R14D', 'R14W', 'R15', 'R15B', 'R15D', 'R15W', + 'R8', 'R8B', 'R8D', 'R8W', 'R9', 'R9B', 'R9D', 'R9W', 'RAX', 'RBP', 'RBX', 'RCX', 'RDI', 'RDX', 'RSI', 'RSP', 'SI', + 'SIL', 'SP', 'SPL'} + +STATUSFLAGS = {'CF', 'PF', 'AF', 'ZF', 'SF', 'OF'} +STATUSFLAGS_noAF = {'CF', 'PF', 'ZF', 'SF', 'OF'} + +maxTPRep = 16 + +#iforms of serializing and memory-ordering instructions according to Ch. 
# iforms of serializing and memory-ordering instructions according to Ch. 8.3 of the Intel manual
serializingInstructions = {'INVD', 'INVEPT', 'INVLPG', 'INVVPID', 'LGDT', 'LIDT', 'LLDT', 'LTR', 'MOV_CR_CR_GPR64', 'MOV_DR_DR_GPR64', 'WBINVD', 'WRMSR',
                           'CPUID', 'IRET', 'RSM', 'SFENCE', 'LFENCE', 'MFENCE'}


def isAMDCPU():
    """Return True if the global `arch` names one of the AMD microarchitectures."""
    return arch in ['ZEN+', 'ZEN2']


def isIntelCPU():
    """Return True if the global `arch` is not one of the AMD microarchitectures."""
    return not isAMDCPU()


# Each entry is (substrings identifying a register family, (8-, 16-, 32-, 64-bit name)).
# The order is significant: the first family with a substring contained in the
# register name wins.
_GPR_FAMILIES = (
    (('AX', 'AH', 'AL'), ('AL', 'AX', 'EAX', 'RAX')),
    (('BX', 'BH', 'BL'), ('BL', 'BX', 'EBX', 'RBX')),
    (('CX', 'CH', 'CL'), ('CL', 'CX', 'ECX', 'RCX')),
    (('DX', 'DH', 'DL'), ('DL', 'DX', 'EDX', 'RDX')),
    (('SP',), ('SPL', 'SP', 'ESP', 'RSP')),
    (('BP',), ('BPL', 'BP', 'EBP', 'RBP')),
    (('SI',), ('SIL', 'SI', 'ESI', 'RSI')),
    (('DI',), ('DIL', 'DI', 'EDI', 'RDI')),
    (('8',), ('R8B', 'R8W', 'R8D', 'R8')),
    (('9',), ('R9B', 'R9W', 'R9D', 'R9')),
    (('10',), ('R10B', 'R10W', 'R10D', 'R10')),
    (('11',), ('R11B', 'R11W', 'R11D', 'R11')),
    (('12',), ('R12B', 'R12W', 'R12D', 'R12')),
    (('13',), ('R13B', 'R13W', 'R13D', 'R13')),
    (('14',), ('R14B', 'R14W', 'R14D', 'R14')),
    (('15',), ('R15B', 'R15W', 'R15D', 'R15')),
)


def _regToWidthIndex(reg, widthIndex):
    """Return the name at position widthIndex (0: 8-bit ... 3: 64-bit) of the family of reg, or None."""
    for substrings, names in _GPR_FAMILIES:
        if any(s in reg for s in substrings):
            return names[widthIndex]
    return None


def regTo64(reg):
    """Return the 64-bit register of the family of reg (e.g. 'EAX' -> 'RAX'); None if unknown."""
    return _regToWidthIndex(reg, 3)


def regTo32(reg):
    """Return the 32-bit register of the family of reg (e.g. 'RAX' -> 'EAX'); None if unknown."""
    return _regToWidthIndex(reg, 2)


def regTo16(reg):
    """Return the 16-bit register of the family of reg (e.g. 'RAX' -> 'AX'); None if unknown."""
    return _regToWidthIndex(reg, 1)


def regTo8(reg):
    """Return the low 8-bit register of the family of reg (e.g. 'RAX' and 'AH' -> 'AL'); None if unknown."""
    return _regToWidthIndex(reg, 0)


def regToSize(reg, size):
    """Return the register of the family of reg with the given size in bits.

    Sizes other than 8, 16 and 32 yield the 64-bit register.
    """
    if size == 8: return regTo8(reg)
    elif size == 16: return regTo16(reg)
    elif size == 32: return regTo32(reg)
    else: return regTo64(reg)


# Returns a set of registers that are a part of the register that is provided (e.g., EAX is a part of RAX; RAX is also a part of RAX)
def getSubRegs(reg):
    subRegs = set()
    subRegs.add(reg)
    if reg in GPRRegs:
        regSize = getRegSize(reg)
        if regSize > 8:
            for size in [16, 32, 64]:
                if size > regSize: continue
                subRegs.add(regToSize(reg, size))
            if 'AX' in reg or 'BX' in reg or 'CX' in reg or 'DX' in reg:
                # the second-to-last character is the family letter, e.g. 'A' for 'RAX'
                subRegs.add(reg[-2] + 'L')
                subRegs.add(reg[-2] + 'H')
            else:
                subRegs.add(regTo8(reg))
    elif 'ZMM' in reg:
        subRegs.add('Y' + reg[1:])
        subRegs.add('X' + reg[1:])
    elif 'YMM' in reg:
        subRegs.add('X' + reg[1:])
    return subRegs


# Returns for a GPR the corresponding 64-bit registers, and for a (X|Y|Z)MM register the corresponding XMM register
def getCanonicalReg(reg):
    if reg in GPRRegs:
        return regTo64(reg)
    elif 'MM' in reg:
        return re.sub('^[YZ]', 'X', reg)
    else:
        return reg


def getRegForMemPrefix(reg, memPrefix):
    """Return the register of the family of reg whose size matches the memory-size prefix."""
    return regToSize(reg, getSizeOfMemPrefix(memPrefix))


def getSizeOfMemPrefix(memPrefix):
    """Return the size in bits denoted by a memory-size prefix (e.g. 'dword ptr' -> 32); -1 if unknown."""
    # the longer keywords have to be tested first, as 'word' is a substring of all of them
    if 'zmmword' in memPrefix: return 512
    elif 'ymmword' in memPrefix: return 256
    elif 'xmmword' in memPrefix: return 128
    elif 'qword' in memPrefix: return 64
    elif 'dword' in memPrefix: return 32
    elif 'word' in memPrefix: return 16
    elif 'byte' in memPrefix: return 8
    else: return -1


def getRegSize(reg):
    """Return the size in bits of a general-purpose or (X|Y|Z)MM/MM register; -1 if unknown."""
    if reg[-1] == 'L' or reg[-1] == 'H' or reg[-1] == 'B': return 8
    # a missing comma previously merged 'BP' and 'SI' into 'BPSI', so that both were reported as 64-bit
    elif reg[-1] == 'W' or reg in ['AX', 'BX', 'CX', 'DX', 'SP', 'BP', 'SI', 'DI']: return 16
    elif reg[0] == 'E' or reg[-1] == 'D': return 32
    elif reg in GPRRegs: return 64
    elif reg.startswith('MM'): return 64
    elif reg.startswith('XMM'): return 128
    elif reg.startswith('YMM'): return 256
    elif reg.startswith('ZMM'): return 512
    else: return -1


def getAddrReg(instrNode, opNode):
    """Return the register that holds the address of the memory operand opNode.

    Suppressed operands use their 'base' attribute; instructions with rex="0"
    use RDI; all others use R14.
    """
    if opNode.attrib.get('suppressed', '0') == '1':
        return opNode.attrib['base']
    elif instrNode.attrib.get('rex', '1') == '0':
        return 'RDI'
    else:
        return 'R14'


# registers that are not used as implicit registers should come first; RAX (and parts of it) should come last, as some instructions have special encodings for that
# prefer low registers to high registers
def sortRegs(regsList):
    # a list (instead of map(...)) keeps the key comparable on Python 3 as well
    return sorted(regsList, key=lambda r: (not any(i.isdigit() for i in r), 'H' in r, 'A' in r, [int(d) for d in re.findall(r'\d+', r)], r))
init += ['SLDT [R14+' + str(memOffset) + ']'] + if iform == 'XLAT': init += ['MOV RBX, R14', 'mov qword ptr [RBX], 0'] + + if (isSSEInstr(instrNode) or isAVXInstr(instrNode)) and supportsAVX: + # Zero upper bits to avoid AVX-SSE transition penalties; also, e.g., dep. breaking and zero-latency instructions do not seem to work otherwise + # we use vzeroall instead of just vzeroupper to make sure that XMM14 is 0 for VSIB addressing + init += ['VZEROALL'] + + if not 'DIV' in instrNode.attrib['iclass'] and not 'SQRT' in instrNode.attrib['iclass']: + for opNode in instrNode.findall('./operand[@r="1"]'): + opIdx = int(opNode.attrib['idx']) + xtype = opNode.attrib.get('xtype', '') + + if opNode.attrib['type'] == 'reg': + reg = opRegDict[opIdx] + regPrefix = re.sub('\d', '', reg) + + if 'MM' in regPrefix and xtype.startswith('f'): + init += ['MOV RAX, 0x4000000040000000'] + for i in range(0, getRegSize(reg)/8, 8): init += ['MOV [R14+' + str(i) + '], RAX'] + + if isAVXInstr(instrNode): + init += ['VMOVUPD ' + reg + ', [R14]'] + else: + init += ['MOVUPD ' + reg + ', [R14]'] + elif regPrefix in ['XMM', 'YMM', 'ZMM'] and isAVXInstr(instrNode): + # some AVX instr. (e.g. VORPS, VAESDEC) incur a penalty (?) if a source was not written by an AVX instr. 
of a similar kind + if reg not in globalDoNotWriteRegs: + for opNode2 in instrNode.findall('./operand[@w="1"]'): + if not opNode2.text == opNode.text: continue + init += [getInstrInstanceFromNode(instrNode, opRegDict={int(opNode2.attrib['idx']):reg}, computeRegMemInit=False).asm] + break + else: + init += ['VXORPS '+reg+', '+reg+', '+reg] + elif 'MM' in regPrefix: + init += ['PXOR '+reg+', '+reg] + elif opNode.attrib['type'] == 'mem': + if 'VSIB' in opNode.attrib: + vsibReg = opNode.attrib['VSIB'] + '14' + init += ['VXORPS ' + vsibReg + ', ' + vsibReg + ', ' + vsibReg] + if xtype.startswith('f'): + init += ['MOV RAX, 0x4000000040000000'] + for i in range(0, int(opNode.attrib['width'])/8, 8): init += ['MOV [R14+' + str(i+memOffset) + '], RAX'] + + return init + +nExperiments = 0 +def runExperiment(instrNode, instrCode, init=None, unrollCount=500, loopCount=0, warmUpCount=10, basicMode=False, htmlReports=None, maxRepeat=1): + # we use a default warmUpCount of 10, as ICL requires at least about that much before memory operations run at full speed + + if init is None: init = [] + localHtmlReports = [] + + global nExperiments + nExperiments += 1 + + instrCode = re.sub(';+', '; ', instrCode.strip('; ')) + if debugOutput: print 'instr: ' + instrCode + codeObjFile = '/tmp/ramdisk/code.o' + assemble(instrCode, codeObjFile, asmFile='/tmp/ramdisk/code.s') + localHtmlReports.append('
  • Code:
    ' + getMachineCode(codeObjFile) + '
  • \n') + + init = list(OrderedDict.fromkeys(init)) # remove duplicates while maintaining the order + initCode = '; '.join(init) + + if instrNode is not None and (instrNode.attrib.get('vex', '') == '1' or instrNode.attrib.get('evex', '') == '1'): + # vex and evex encoded instructions need a warm-up period before memory reads operate at full speed; + # https://software.intel.com/en-us/forums/intel-isa-extensions/topic/710248 + reg = 'ZMM' if 'ZMM' in instrNode.attrib['iform'] else 'YMM' + # the instruction needs to be used at least twice in the body of the loop + # putting it to one_time_init is not sufficient, independently of the loop count, example: + # "VPTEST YMM0, YMM1;CMOVZ R13, R15; VPBROADCASTQ ZMM0, R13" on CNL + avxInitCode = 'MOV R15, 10000; L: VADDPS {0}, {1}, {1}; VADDPS {0}, {1}, {1}; DEC R15; JNZ L; '.format(reg + '0', reg + '1') + initCode = avxInitCode + initCode + + nanoBenchCmd = 'sudo ./kernel-nanoBench.sh' + nanoBenchCmd += ' -unroll ' + str(unrollCount) + if loopCount > 0: nanoBenchCmd += ' -loop ' + str(loopCount) + if basicMode: nanoBenchCmd += ' -basic' + nanoBenchCmd += ' -warm_up_count ' + str(warmUpCount) + nanoBenchCmd += ' -asm "' + instrCode + '"' + + initObjFile = None + if initCode: + if debugOutput: print 'init: ' + initCode + initObjFile = '/tmp/ramdisk/init.o' + assemble(initCode, initObjFile, asmFile='/tmp/ramdisk/init.s') + localHtmlReports.append('
  • Init:
    ' + re.sub(';[ \t]*(.)', r';\n\1', initCode) + '
  • \n') + nanoBenchCmd += ' -asm_init "' + initCode + '"' + + localHtmlReports.append('
  • Show nanoBench command
  • \n') + + setNanoBenchParameters(unrollCount=unrollCount, loopCount=loopCount, warmUpCount=warmUpCount, basicMode=basicMode) + + ret = runNanoBench(codeObjFile=codeObjFile, initObjFile=initObjFile) + + localHtmlReports.append('
  • Results:\n\n
  • ') + + if arch in ['NHM', 'WSM'] and 'UOPS_PORT3' in ret: + # Workaround for broken port4 and port5 counters + ret['UOPS_PORT4'] = ret['UOPS_PORT3'] + ret['UOPS_PORT5'] = max(0, ret['UOPS'] - ret['UOPS_PORT0'] - ret['UOPS_PORT1'] - ret['UOPS_PORT2'] - ret['UOPS_PORT3'] - ret['UOPS_PORT4']) + + if isAMDCPU(): + ret['Core cycles'] = ret['APERF'] + + if maxRepeat>0: + if any(v<-0.05 for v in ret.values()): + print 'Repeating experiment because there was a value < 0' + return runExperiment(instrNode, instrCode, init=init, unrollCount=unrollCount, loopCount=loopCount, basicMode=basicMode, htmlReports=htmlReports, maxRepeat=maxRepeat-1) + + #sumPortUops = sum(v for e,v in ret.items() if 'PORT' in e and not '4' in e) + #if (sumPortUops % 1) > .2 and (sumPortUops % 1) < .8: + # print 'Repeating experiment because the sum of the port usages is not an integer' + # print ret + # return runExperiment(instrNode, instrCode, init=init, unrollCount=unrollCount, loopCount=loopCount, basicMode=basicMode, htmlReports=htmlReports, maxRepeat=maxRepeat-1) + + if any('PORT' in e for e in ret): + maxPortUops = max(v/(len(e)-9) for e,v in ret.items() if 'PORT' in e) + if maxPortUops * .98 > ret['Core cycles']: + print 'Repeating experiment because there were more uops on a port than core cycles' + return runExperiment(instrNode, instrCode, init=init, unrollCount=unrollCount, loopCount=loopCount, basicMode=basicMode, htmlReports=htmlReports, maxRepeat=maxRepeat-1) + + if htmlReports is not None: + htmlReports.extend(localHtmlReports) + return ret + + +def writeFile(fileName, content): + with open(fileName, "w") as f: + f.write(content+"\n"); + + +def getMachineCode(objFile): + try: + machineCode = subprocess.check_output(['objdump', '-M', 'intel', '-d', objFile]) + return machineCode.partition('<.text>:\n')[2] + except subprocess.CalledProcessError as e: + print "Error (getMachineCode): " + str(e) + + +def getEventConfig(event): + if event == 'UOPS': + if arch in ['CON', 'WOL']: return 
'A0.00' # RS_UOPS_DISPATCHED + if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW']: return 'C2.01' # UOPS_RETIRED.ALL + if arch in ['SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'B1.01' # UOPS_EXECUTED.THREAD + if arch in ['ZEN+', 'ZEN2']: return '0C1.00' + if event == 'RETIRE_SLOTS': + if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'C2.02' + if event == 'UOPS_PORT0': + if arch in ['CON', 'WOL']: return 'A1.01.CTR=0' + if arch in ['NHM', 'WSM']: return 'B1.01' + if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'A1.01' + if event == 'UOPS_PORT1': + if arch in ['CON', 'WOL']: return 'A1.02.CTR=0' + if arch in ['NHM', 'WSM']: return 'B1.02' + if arch in ['SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'A1.02' + if event == 'UOPS_PORT2': + if arch in ['CON', 'WOL']: return 'A1.04.CTR=0' + if arch in ['NHM', 'WSM']: return 'B1.04' + if arch in ['SNB', 'IVB']: return 'A1.0C' + if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL']: return 'A1.04' + if event == 'UOPS_PORT3': + if arch in ['CON', 'WOL']: return 'A1.08.CTR=0' + if arch in ['NHM', 'WSM']: return 'B1.08' + if arch in ['SNB', 'IVB']: return 'A1.30' + if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL']: return 'A1.08' + if event == 'UOPS_PORT4': + if arch in ['CON', 'WOL']: return 'A1.10.CTR=0' + if arch in ['NHM', 'WSM']: return 'B1.10' + if arch in ['SNB', 'IVB']: return 'A1.40' + if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL']: return 'A1.10' + if event == 'UOPS_PORT5': + if arch in ['CON', 'WOL']: return 'A1.20.CTR=0' + if arch in ['NHM', 'WSM']: return 'B1.20' + if arch in ['SNB', 'IVB']: return 'A1.80' + if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'A1.20' + if event == 'UOPS_PORT6': + if arch in ['HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return 'A1.40' + if event == 'UOPS_PORT7': + if arch in ['HSW', 
'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL']: return 'A1.80' + if event == 'UOPS_PORT23': + if arch in ['ICL']: return 'A1.04' + if event == 'UOPS_PORT49': + if arch in ['ICL']: return 'A1.10' + if event == 'UOPS_PORT78': + if arch in ['ICL']: return 'A1.80' + if event == 'DIV_CYCLES': + if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL']: return '14.01.CMSK=1' # undocumented on HSW, but seems to work + if arch in ['ICL']: return '14.09.CMSK=1' + if arch in ['ZEN+', 'ZEN2']: return '0D3.00' + if event == 'ILD_STALL.LCP': + if arch in ['NHM', 'WSM', 'SNB', 'IVB', 'HSW', 'BDW', 'SKL', 'SKX', 'KBL', 'CFL', 'CNL', 'ICL']: return '87.01' + if event == 'INST_DECODED.DEC0': + if arch in ['NHM', 'WSM']: return '18.01' + if event == 'FpuPipeAssignment.Total0': + if arch in ['ZEN+', 'ZEN2']: return '000.01' + if event == 'FpuPipeAssignment.Total1': + if arch in ['ZEN+', 'ZEN2']: return '000.02' + if event == 'FpuPipeAssignment.Total2': + if arch in ['ZEN+', 'ZEN2']: return '000.04' + if event == 'FpuPipeAssignment.Total3': + if arch in ['ZEN+', 'ZEN2']: return '000.08' + return None + + +def configurePFCs(events): + content = '' + for event in events: + cfg = getEventConfig(event) + if cfg is not None: + content += cfg + ' ' + event + '\n' + setNanoBenchParameters(config=content) + + +InstrInstance = namedtuple('InstrInstance', ['instrNode', 'asm', 'readRegs', 'writtenRegs', 'opRegDict', 'regMemInit']) + +def getInstrInstanceFromNode(instrNode, doNotWriteRegs=None, doNotReadRegs=None, useDistinctRegs=True, opRegDict=None, memOffset=0, immediate=2, computeRegMemInit=True): + if not doNotWriteRegs: doNotWriteRegs = [] + if not doNotReadRegs: doNotReadRegs = [] + if not opRegDict: opRegDict = {} + + if instrNode.attrib['extension'] == 'AVX2GATHER': useDistinctRegs=True + + readRegs = set() + writtenRegs = set() + opRegDict = dict(opRegDict) + + for operandNode in instrNode.iter('operand'): + if operandNode.attrib['type'] == "reg": + regsList = 
sortRegs(operandNode.text.split(',')) + if len(regsList) == 1: + reg = regsList[0] + opRegDict[int(operandNode.attrib['idx'])] = reg + if operandNode.attrib.get('w', '0') == '1': + writtenRegs.add(reg) + if operandNode.attrib.get('r', '0') == '1': + readRegs.add(reg) + elif operandNode.attrib['type'] == "mem" and 'base' in operandNode.attrib: + readRegs.add(operandNode.attrib['base']) + + asm = instrNode.attrib['asm'] + + first = True + for operandNode in instrNode.iter('operand'): + opI = int(operandNode.attrib['idx']) + + if operandNode.attrib.get('suppressed', '0') == '1': + continue; + + if not first and not operandNode.attrib.get('opmask', '') == '1': + asm += ", " + else: + asm += " " + first=False; + + if operandNode.attrib['type'] == "reg": + if opI in opRegDict: + reg = opRegDict[opI] + else: + regsList = operandNode.text.split(',') + + if len(regsList) > 1: + ignoreRegs = set() + if operandNode.attrib.get('w', '0') == '1': + ignoreRegs |= set(doNotWriteRegs)|globalDoNotWriteRegs|(set(opRegDict.values()) if useDistinctRegs else set(doNotReadRegs)) + if operandNode.attrib.get('r', '0') == '1': + ignoreRegs |= set(doNotReadRegs)|(writtenRegs|readRegs|set(opRegDict.values()) if useDistinctRegs else set(doNotWriteRegs)|globalDoNotWriteRegs) + regsList = filter(lambda x: not any(y in ignoreRegs for y in getSubRegs(x)) and not (x in [z for y in ignoreRegs for z in getSubRegs(y)]), regsList) + if not regsList: + return None; + + reg = sortRegs(regsList)[0]; + if not useDistinctRegs: + for oReg in opRegDict.values(): + if oReg in regsList: + reg = oReg; + break + + opRegDict[opI] = reg + if operandNode.attrib.get('w', '0') == '1': + writtenRegs.add(reg) + if operandNode.attrib.get('r', '0') == '1': + readRegs.add(reg) + + if not operandNode.attrib.get('opmask', '') == '1': + asm += reg + else: + asm += ' {' + reg + '}' + if instrNode.attrib.get('zeroing', '') == '1': + asm += '{z}' + elif operandNode.attrib['type'] == "mem": + asmprefix = 
operandNode.attrib.get('memory-prefix', '') + asm += asmprefix + if asmprefix != '': + asm += ' ' + + address = '' + if operandNode.attrib.get('VSIB', '0') != "0": + address = 'R14+' + operandNode.attrib.get('VSIB') + '14' + readRegs.add('R14') + readRegs.add(operandNode.attrib.get('VSIB') + '14') + else: + address = getAddrReg(instrNode, operandNode) + readRegs.add(address) + + asm += '[' + address + ('+'+str(memOffset) if memOffset else '') + ']' + + memorySuffix = operandNode.attrib.get('memory-suffix', '') + if memorySuffix: + asm += ' ' + memorySuffix + elif operandNode.attrib['type'] == 'agen': + agen = instrNode.attrib['agen'] + address = [] + + if 'R' in agen: address.append('RIP') + if 'B' in agen: address.append('R14') + if 'I' in agen: address.append('2*R13') + if 'D' in agen: address.append('8') + + asm += ' [' + '+'.join(address) + ']' + elif operandNode.attrib['type'] == "imm": + if instrNode.attrib.get('roundc', '') == '1': + asm += '{rn-sae}, ' + elif instrNode.attrib.get('sae', '') == '1': + asm += '{sae}, ' + width = int(operandNode.attrib['width']) + if operandNode.text: + imm = operandNode.text + elif (width == 8 or instrNode.attrib['iclass'] in ['ENTER', 'RET_FAR', 'RET_NEAR']): + imm = immediate + else: + imm = 1 << (width-8) + asm += str(imm) + elif operandNode.attrib['type'] == "relbr": + asm += "1f" + + if not 'sae' in asm: + if instrNode.attrib.get('roundc', '') == '1': + asm += ', {rn-sae}' + elif instrNode.attrib.get('sae', '') == '1': + asm += ', {sae}' + + if '1f' in asm: + asm = asm + '; 1: ' + + regMemInit = [] + if computeRegMemInit: regMemInit = getRegMemInit(instrNode, opRegDict, memOffset) + return InstrInstance(instrNode, asm, readRegs, writtenRegs, opRegDict, regMemInit) + +def createIacaAsmFile(fileName, prefixInstr, prefixRep, instr): + asm = '.intel_syntax noprefix\n .byte 0x0F, 0x0B; mov ebx, 111; .byte 0x64, 0x67, 0x90\n' + if prefixInstr: + for i in xrange(prefixRep): + asm += prefixInstr + "\n" + asm += instr + "\n" + 
asm += "1:\n" + asm += 'mov ebx, 222; .byte 0x64, 0x67, 0x90; .byte 0x0F, 0x0B\n' + writeFile(fileName, asm) + + +def getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockInstrNode, blockInstrRep, blockedPorts, config, htmlReports): + instrInstance = config.independentInstrs[0] + instr = instrInstance.asm + readRegs = instrInstance.readRegs + writtenRegs = instrInstance.writtenRegs + + if debugOutput: print ' instr: ' + instr + 'rR: ' + str(readRegs) + ', wR: ' + str(writtenRegs) + blockInstrsList = getIndependentInstructions(blockInstrNode, True, writtenRegs|readRegs, writtenRegs|readRegs, 64) + if debugOutput: print ' bIL: ' + str(blockInstrsList) + + htmlReports.append('

    With blocking instructions for port' + + ('s {' if len(blockedPorts)>1 else ' ') + + str(list(blockedPorts))[1:-1] + + ('}' if len(blockedPorts)>1 else '') + ':

    ') + + if useIACA: + createIacaAsmFile("/tmp/ramdisk/asm.s", ';'.join(islice(cycle(x.asm for x in blockInstrsList), blockInstrRep)), 1, instr) + + try: + subprocess.check_output(['as', '/tmp/ramdisk/asm.s', '-o', '/tmp/ramdisk/asm.o']) + iacaOut = subprocess.check_output(iacaCMDLine + (['-analysis', 'THROUGHPUT'] if iacaVersion=='2.1' else []) + ['/tmp/ramdisk/asm.o'], stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + print "Error: " + e.output + return None + + if not iacaOut or ' !' in iacaOut or ' X' in iacaOut or ' 0X' in iacaOut or not 'Total Num Of Uops' in iacaOut: + print "IACA error" + return None + + allPortsLine = re.search('\| Cycles \|.*', iacaOut).group(0) + instrPortsLine = iacaOut.split('\n')[-3] + + allUopsOnBlockedPorts = 0.0 + instrUopsOnBlockedPorts = 0.0 + + for p in blockedPorts: + allPortsCol = allPortsLine.split('|')[p+2].split() + allUopsOnBlockedPorts += float(allPortsCol[0]) + + instrPortsCol = instrPortsLine.split('|')[p+2].split() + if instrPortsCol: + instrUopsOnBlockedPorts += float(instrPortsCol[0]) + + htmlReports.append('
    ' + iacaOut + '
    ') + + if allUopsOnBlockedPorts < blockInstrRep-.5: + # something went wrong; fewer uops on ports than blockInstrRep + # happens, e.g., on SKX for ports {0, 1} if AVX-512 is active + return None + + return int(.2+instrUopsOnBlockedPorts) + else: + + if isIntelCPU(): + if arch in ['NHM', 'WSM']: + # Needed for workaround for broken port 5 counter + events = ['UOPS_PORT'+str(p) for p in range(0,6)] + ['UOPS'] + else: + events = ['UOPS_PORT'+str(p) for p in blockedPorts] + else: + events = ['FpuPipeAssignment.Total0', 'FpuPipeAssignment.Total1', 'FpuPipeAssignment.Total2', 'FpuPipeAssignment.Total3'] + configurePFCs(events) + + blockInstrAsm = ';'.join(islice(cycle(x.asm for x in blockInstrsList), blockInstrRep)) + + unrollCount = 1000/blockInstrRep # make sure that instrs. fit into icache + if isAMDCPU(): unrollCount = max(unrollCount, 100) # ZEN+ sometimes undercounts FP usage if code is short + + + init = list(chain.from_iterable([x.regMemInit for x in blockInstrsList])) + instrInstance.regMemInit + config.init + + htmlReports.append('\n') + + if float(measurementResult['Core cycles']) < -10: + #something went wrong; this happens for example on HSW with long sequences of JMP instructions + if debugOutput: print "Core cycles < -10 in getUopsOnBlockedPorts" + + if sum(u for p, u in measurementResult.items() if ('UOPS_PORT' in p or 'FpuPipeAssignment.Total' in p)) < blockInstrRep-.5: + # something went wrong; fewer uops on ports than blockInstrRep + # happens, e.g., on SKX for ports {0, 1} if AVX-512 is active + return None + + if isIntelCPU(): + ports_dict = {int(p[9:]): i for p, i in measurementResult.items() if p.startswith('UOPS_PORT')} + else: + ports_dict = {int(p[23:]): i for p, i in measurementResult.items() if 'FpuPipeAssignment.Total' in p} + + return int(.2+sum([uops for p, uops in ports_dict.items() if p in blockedPorts])) - blockInstrRep + + +# Takes an instrNode and returns a list [instrI, instrI', ...] s.t. 
instrI(')* are the results of +# calls to getInstrInstanceFromNode for instrNode and there are no read-after-writes of the same regs/memory locations. The length of the list is limited by maxTPRep. +def getIndependentInstructions(instrNode, useDistinctRegs, doNotReadRegs = None, doNotWriteRegs = None, initialOffset = 0, immediate = 2): + if not doNotReadRegs: doNotReadRegs = set() + if not doNotWriteRegs: doNotWriteRegs = set() + doNotReadRegs |= specialRegs + doNotWriteRegs |= globalDoNotWriteRegs|specialRegs + + for opNode in instrNode.iter('operand'): + if opNode.attrib['type'] == 'reg': + regs = sortRegs(opNode.text.split(",")) + if len(regs) == 1: + doNotReadRegs.add(regs[0]) + doNotWriteRegs.add(regs[0]) + if len(regs) >= 8 and 'RAX' in map(regTo64, regs): + #avoid RAX register if possible as some instructions have a special encoding for this + doNotReadRegs.add('RAX') + doNotWriteRegs.add('RAX') + + independentInstructions = [] + offset = initialOffset + + for _ in range(maxTPRep): + instrI = getInstrInstanceFromNode(instrNode, doNotWriteRegs, doNotReadRegs, useDistinctRegs, {}, offset, immediate=immediate) + if not instrI: + break + + if instrI in independentInstructions: + break + + maxMemWidth = 0 + for memNode in instrNode.findall('./operand[@type="mem"][@w="1"]'): + maxMemWidth = max(maxMemWidth, int(memNode.attrib.get('width', '0'))/8) + offset += maxMemWidth + + independentInstructions.append(instrI) + + doNotWriteRegs = doNotWriteRegs | instrI.writtenRegs | instrI.readRegs + doNotReadRegs = doNotReadRegs | instrI.writtenRegs + + if not independentInstructions: + instrI = getInstrInstanceFromNode(instrNode, useDistinctRegs=False, immediate=immediate) + independentInstructions.append(instrI) + + return independentInstructions + +# Returns True iff there are two non-suppressed operands that can use the same register +def hasCommonRegister(instrNode): + for opNode1 in instrNode.iter('operand'): + if opNode1.attrib.get('suppressed', '0') == '1': continue 
+ if not opNode1.attrib['type'] == 'reg': continue + regs1 = opNode1.text.split(",") + for opNode2 in instrNode.iter('operand'): + if opNode1 == opNode2: continue + if opNode2.attrib.get('suppressed', '0') == '1': continue + if not opNode2.attrib['type'] == 'reg': continue + regs2 = opNode2.text.split(",") + if set(regs1).intersection(set(regs2)): + return True + return False + +def getThroughputIacaNoInteriteration(instrNode, htmlReports): + createIacaAsmFile("/tmp/ramdisk/asm.s", "", 0, getInstrInstanceFromNode(instrNode, useDistinctRegs=True).asm) + try: + subprocess.check_output(['as', '/tmp/ramdisk/asm.s', '-o', '/tmp/ramdisk/asm.o']) + iaca_tp = subprocess.check_output(iacaCMDLine + (['-analysis', 'THROUGHPUT'] if iacaVersion=='2.1' else []) + ['-no_interiteration', '/tmp/ramdisk/asm.o'], stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + print "Error: " + e.output + return None + + if debugOutput: + print instrNode.attrib['iform'] + ' - NoInteriteration' + print iaca_tp + + htmlReports.append('
    ' + iaca_tp + '
    \n') + + if not iaca_tp or ' !' in iaca_tp or ' X' in iaca_tp or ' 0X' in iaca_tp or not 'Total Num Of Uops' in iaca_tp: + print "IACA error" + return None + + cycles = float(iaca_tp.split('\n')[3].split()[2]) + return cycles + +class TPConfig: + def __init__(self, independentInstrs=None, depBreakingInstrs='', init=None, preInstrCode='', preInstrNodes=None, note=''): + self.independentInstrs = ([] if independentInstrs is None else independentInstrs) + self.depBreakingInstrs = depBreakingInstrs + self.init = ([] if init is None else init) + self.preInstrCode = preInstrCode + self.preInstrNodes = ([] if preInstrNodes is None else preInstrNodes) + self.note = note + +def getTPConfigs(instrNode, useDistinctRegs=True, computeIndepAndDepBreakingInstrs=True): + iform = instrNode.attrib['iform'] + iclass = instrNode.attrib['iclass'] + + if 'DIV' in iclass or 'SQRT' in iclass: + return getTPConfigsForDiv(instrNode) + + independentInstrs = [] + depBreakingInstrs = '' + if computeIndepAndDepBreakingInstrs: + independentInstrs = getIndependentInstructions(instrNode, useDistinctRegs) + depBreakingInstrs = getDependencyBreakingInstrsForSuppressedOperands(instrNode) + + # instructions with multiple configs + if 'I8' in instrNode.attrib['string']: + configs = [] + for immediate in [0, 1, 2]: + if instrNode.attrib['string'].replace('I8', str(immediate)) in instrNodeDict: + continue + config = TPConfig(note='With immediate = ' + str(immediate)) + config.independentInstrs = getIndependentInstructions(instrNode, useDistinctRegs, immediate=immediate) + config.depBreakingInstrs = depBreakingInstrs + configs.append(config) + return configs + + if iclass in ['JB', 'JBE', 'JLE', 'JNB', 'JNBE', 'JNLE', 'JNO', 'JNP', 'JNS', 'JNZ', 'JO', 'JP', 'JS', 'JZ']: + config0 = TPConfig(independentInstrs=independentInstrs, init=['pushfq; and qword ptr [RSP], ~0x8D5; popfq'], note='With all flags set to 0') + config1 = TPConfig(independentInstrs=independentInstrs, init=['pushfq; or qword ptr [RSP], 
0x8D5; popfq'], note='With all flags set to 1') + return [config0, config1] + + if iclass in ['JL', 'JNL']: + config0 = TPConfig(independentInstrs=independentInstrs, init=['pushfq; and qword ptr [RSP], ~0x8D5; popfq'], note='With SF=OF') + config1 = TPConfig(independentInstrs=independentInstrs, init=['pushfq; and qword ptr [RSP], ~0x8D5; or qword ptr [RSP], 0x80; popfq'], note='With SF!=OF') + return [config0, config1] + + if iclass in ['JRCXZ']: + config0 = TPConfig(independentInstrs=independentInstrs, init=['mov RCX, 0'], note='With RCX=0') + config1 = TPConfig(independentInstrs=independentInstrs, init=['mov RCX, 1'], note='With RCX=1') + return [config0, config1] + + if 'LOOP' in iform or 'REP' in iform: + configs = [] + for regVal in ['0', '1', '2']: + config = TPConfig(independentInstrs=independentInstrs, preInstrCode='mov RCX, '+regVal, note='With RCX='+regVal) + if instrNode.attrib['category'] in ['IOSTRINGOP']: + config.init = ['mov DX, 0x80'] + configs.append(config) + return configs + + # instructions with one config + preInstrCode, preInstrNodes = getPreInstr(instrNode) + config = TPConfig(independentInstrs, depBreakingInstrs, [], preInstrCode, preInstrNodes) + + if re.search('BT.*MEMv_GPRv', iform): + config.init = list(set('mov ' + regTo64(r) + ', 0' for i in independentInstrs for r in i.readRegs if not regTo64(r) in globalDoNotWriteRegs)) + + if iform in ['CALL_NEAR_GPRv', 'JMP_GPRv']: + config.independentInstrs = [getInstrInstanceFromNode(instrNode, opRegDict={1: 'RAX'})] + + if iform in ['CALL_NEAR_MEMv', 'JMP_MEMv']: + config.independentInstrs = [getInstrInstanceFromNode(instrNode)] + + if iclass == 'FXRSTOR': config.init = ['FXSAVE [R14]'] + if iclass == 'FXRSTOR64': config.init = ['FXSAVE64 [R14]'] + + if iform in ['IN_AL_IMMb', 'IN_OeAX_IMMb', 'OUT_IMMb_AL', 'OUT_IMMb_OeAX']: + config.independentInstrs = getIndependentInstructions(instrNode, useDistinctRegs, immediate=0x80) + + if iform in ['IN_AL_DX', 'IN_OeAX_DX', 'OUT_DX_AL', 'OUT_DX_OeAX'] 
or instrNode.attrib['category'] in ['IOSTRINGOP']: + config.init = ['mov DX, 0x80'] + + if iform == 'LLDT_GPR16': config.init = list(set('SLDT ' + reg for i in independentInstrs for reg in i.readRegs)) + if iform == 'LMSW_GPR16': config.init = list(set('SMSW ' + reg for i in independentInstrs for reg in i.readRegs)) + if iform == 'LMSW_MEMw': config.init = list(['SMSW [R14+'+str(i*64)+']' for i in range(0,maxTPRep)]) + + if iform == 'POPF': + config.init = ['PUSHF; POP AX'] + if iform == 'POPFQ': + config.init = ['PUSHFQ; pop RAX'] + + if iform in ['RDMSR', 'WRMSR']: config.init = ['MOV RCX, 0xE7'] #TSC Frequency Clock Counter + if iform in ['RDPMC']: config.init = ['MOV RCX, 0'] + + if iform == 'RET_NEAR_IMMw': + config.independentInstrs = [getInstrInstanceFromNode(instrNode, immediate=8)] + + return [config] + +def getPreInstr(instrNode): + iform = instrNode.attrib['iform'] + preInstrCode = '' + preInstrNodes = None + + if iform in ['CALL_NEAR_GPRv', 'JMP_GPRv']: + preInstrCode = 'lea RAX, [RIP+2]' + preInstrNodes = [instrNodeDict['LEA_RD (R64)']] + + if iform in ['CALL_NEAR_MEMv', 'JMP_MEMv']: + preInstrCode = 'lea RAX, [RIP+6]; mov [R14], RAX' + preInstrNodes = [instrNodeDict['LEA_RD (R64)'], instrNodeDict['MOV (M64, RAX)']] + + if iform == 'LEAVE': + preInstrCode = 'lea RBP, [R14]' + preInstrNodes = [instrNodeDict['LEA_B (R64)']] + + if iform == 'POPF': + preInstrCode = 'PUSH AX' + preInstrNodes = [instrNodeDict['PUSH (R16)']] + + if iform == 'POPFQ': + preInstrCode = 'PUSH RAX' + preInstrNodes = [instrNodeDict['PUSH (R64)']] + + if iform == 'RET_NEAR': + preInstrCode = 'lea RAX, [RIP+5]; mov [RSP], RAX' + preInstrNodes = [instrNodeDict['LEA_RD (R64)'], instrNodeDict['MOV (M64, RAX)']] + + if iform == 'RET_NEAR_IMMw': + preInstrCode = 'lea RAX, [RIP+7]; mov [RSP], RAX' + preInstrNodes = [instrNodeDict['LEA_RD (R64)'], instrNodeDict['MOV (M64, RAX)']] + + return (preInstrCode, preInstrNodes) + +# Returns [minConfig, maxConfig] +def 
getTPConfigsForDiv(instrNode): + memDivisor = len(instrNode.findall('./operand[@type="mem"]'))>0 + iclass = instrNode.attrib['iclass'] + + minConfig = TPConfig(note='Fast division') + maxConfig = TPConfig(note='Slow division') + + if iclass in ['DIV', 'IDIV']: + for op in instrNode.iter('operand'): + if op.attrib.get('suppressed', '0') == '0': + memDivisor = op.attrib['type'] == 'mem' + width = int(op.attrib['width']) + if width == 8: + maxConfig.preInstrCode = 'MOV AX, 13057' + maxConfig.preInstrNodes = [instrNodeDict['MOV (R16, I16)']] + maxDivisor = '123' + elif width == 16: + maxConfig.preInstrCode = 'MOV AX, 133; MOV DX, 0x343a' + maxConfig.preInstrNodes = [instrNodeDict['MOV (R16, I16)'], instrNodeDict['MOV (R16, I16)']] + maxDivisor = '0x75e6' + elif width == 32: + maxConfig.preInstrCode = 'MOV EAX, 133; MOV EDX, 0x343a9ed7' + maxConfig.preInstrNodes = [instrNodeDict['MOV (R32, I32)'], instrNodeDict['MOV (R32, I32)']] + maxDivisor = '0x75e6e44f' + else: + maxConfig.preInstrCode = 'MOV RAX, 133; MOV RDX, 0x343a9ed744556677' + maxConfig.preInstrNodes = [instrNodeDict['MOV (R64, I32)'],instrNodeDict['MOV (R64, I64)']] + maxDivisor = '0x75e6e44fccddeeff' + + if memDivisor: + memPrefix = instrNode.findall('./operand[@type="mem"]')[0].attrib['memory-prefix'] + minConfig.init = ['MOV ' + memPrefix + ' [R14], 1'] + maxConfig.init = ['MOV ' + regToSize('R8', width) + ', ' + maxDivisor + '; MOV ' + memPrefix + ' [R14], ' + regToSize('R8', width)] + instrI = getInstrInstanceFromNode(instrNode) + else: + minConfig.init = ['MOV ' + regToSize('RBX', width) + ', 1'] + maxConfig.init = ['MOV ' + regToSize('RBX', width) + ', ' + maxDivisor] + instrI = getInstrInstanceFromNode(instrNode, opRegDict={int(op.attrib['idx']):regToSize('RBX', width)}) + + minConfig.independentInstrs = [instrI] + maxConfig.independentInstrs = [instrI] + minConfig.init += ['MOV RAX, 0; MOV RDX, 0'] + minConfig.preInstrCode = 'MOV RAX, 0; MOV RDX, 0' + minConfig.preInstrNodes = [instrNodeDict['MOV 
(R64, I32)'], instrNodeDict['MOV (R64, I32)']] + elif iclass in ['DIVSS', 'DIVPS', 'DIVSD', 'DIVPD', 'VDIVSS', 'VDIVPS', 'VDIVSD', 'VDIVPD']: + dataType = iclass[-1] + if dataType == 'S': + maxDividend = '0x54ed392654ed3926' #8.15093E12 in high and low 32-bit + maxDivisor = '0x3f99f4c03f99f4c0' #1.20278 in high and low 32-bit + minDividend = '0x3f8000003f800000' #1.0 in high and low 32-bit + minDivisor = '0x3f8000003f800000' #1.0 in high and low 32-bit + else: + maxDividend = '0x429da724b687da66' #8.1509281715106E12 + maxDivisor = '0x3ff33e97f934078b' #1.20278165192619 + minDividend = '0x3ff0000000000000' #1.0 + minDivisor = '0x3ff0000000000000' #1.0 + + for config, dividend, divisor in [(maxConfig, maxDividend, maxDivisor), (minConfig, minDividend, minDivisor)]: + config.init = ['MOV RAX, ' + dividend] + config.init += ['MOV RBX, ' + divisor] + for i in range(0, 64, 8): config.init += ['MOV [R14+' + str(i) + '], RBX'] + for i in range(64, 128, 8): config.init += ['MOV [R14+' + str(i) + '], RAX'] + + if instrNode.attrib['iclass'] in ['DIVSS', 'DIVPS', 'DIVSD', 'DIVPD']: + config.init += ['MOVUP' + dataType + ' XMM0, [R14]'] + config.init += ['MOVUP' + dataType + ' XMM1, [R14+64]'] + config.init += ['MOVUP' + dataType + ' XMM2, XMM1'] + config.preInstrCode = 'MOVUP' + dataType + ' XMM2, XMM1; ' + config.preInstrNodes = [instrNodeDict['MOVUP' + dataType + '_0F10 (XMM, XMM)']] + config.independentInstrs = [getInstrInstanceFromNode(instrNode, opRegDict={1:'XMM2', 2:'XMM0'})] + else: + regType = 'XMM' + if 'YMM' in instrNode.attrib['iform']: regType = 'YMM' + if 'ZMM' in instrNode.attrib['iform']: regType = 'ZMM' + + nOperands = len(instrNode.findall('./operand')) + + dividendReg = regType + '0' + divisorReg = regType + '1' + + config.init += ['VMOVUP' + dataType + ' ' + dividendReg + ', [R14+64]'] + config.init += ['VMOVUP' + dataType + ' ' + divisorReg + ', [R14]'] + + config.independentInstrs = [getInstrInstanceFromNode(instrNode, opRegDict={1:regType+str(reg), 
(nOperands-1):dividendReg, nOperands:divisorReg}) for reg in range(2, 10)] + elif instrNode.attrib['iclass'] in ['SQRTSS', 'SQRTPS', 'SQRTSD', 'SQRTPD', 'RSQRTSS', 'RSQRTPS', 'RCPSS', 'RCPPS', 'VSQRTSS', 'VSQRTPS', 'VSQRTSD', 'VSQRTPD','VRSQRTSS', 'VRSQRTPS', 'VRCPSS', 'VRCPPS', 'VRSQRT14SS', 'VRSQRT14SD', 'VRSQRT14PS', 'VRSQRT14PD']: + dataType = instrNode.attrib['iclass'][-1] + + if dataType == 'S': + maxArg = '0x72d30ff172d30ff1' #8.36104E30 in high and low 32-bit + minArg = '0x3f8000003f800000' #1.0 in high and low 32-bit + else: + maxArg = '0x465a61fe1acdc21c' #8.3610378602352937E30 + minArg = '0x3ff0000000000000' #1.0 + + instrPrefix = '' + if instrNode.attrib['iclass'].startswith('V'): instrPrefix = 'V' + + for arg, config in [(maxArg, maxConfig), (minArg, minConfig)]: + regType = 'XMM' + if 'YMM' in instrNode.attrib['iform']: regType = 'YMM' + if 'ZMM' in instrNode.attrib['iform']: regType = 'ZMM' + + targetRegIdx = min(int(opNode.attrib['idx']) for opNode in instrNode.findall('./operand') if opNode.text and regType in opNode.text) + + config.init = ['MOV RAX, ' + arg] + + if memDivisor: + for i in range(0, 64, 8): config.init += ['MOV [R14+' + str(i) + '], RAX'] + instrs = [getInstrInstanceFromNode(instrNode, opRegDict={targetRegIdx:regType+str(reg)}) for reg in range(2, 10)] + else: + sourceReg = regType + '0' + config.init += [instrPrefix + 'MOVUP' + dataType + ' ' + sourceReg + ', [R14]'] + sourceRegIdx = max(int(opNode.attrib['idx']) for opNode in instrNode.findall('./operand') if opNode.text and regType in opNode.text) + instrs = [getInstrInstanceFromNode(instrNode, opRegDict={targetRegIdx:regType+str(reg), sourceRegIdx: sourceReg}) for reg in range(2, 10)] + + config.independentInstrs = instrs + + return [minConfig, maxConfig] + + +# rounds to the nearest multiple of 1/5, 1/4, or 1/3 (in that order) if the value is at most 0.015 smaller or larger than this multiple; +# otherwise rounds to two decimals +def fancyRound(cycles): + round5 = 
round(round(cycles*5)/5, 2) + round4 = round(round(cycles*4)/4, 2) + round3 = round(round(cycles*3)/3, 2) + if abs(round5-cycles) <= 0.015: + return round5 + elif abs(round4-cycles) <= 0.015: + return round4 + elif abs(round3-cycles) <= 0.015: + return round3 + return round(cycles, 2) + + +TPResult = namedtuple('TPResult', ['TP', 'TP_noDepBreaking_noLoop', 'TP_single', 'uops', 'fused_uops', 'divCycles', 'ILD_stalls', 'dec0', 'config', 'unblocked_ports']) + +# returns TPResult +# port usages are averages (when no ports are blocked by other instructions) +def getThroughputAndUops(instrNode, useDistinctRegs, htmlReports): + configs = getTPConfigs(instrNode, useDistinctRegs) + + minTP = sys.maxint + minTP_noDepBreaking_noLoop = sys.maxint + minTP_single = sys.maxint + + if useIACA: + config = configs[0] # consider only first config as IACA does not seem to consider different values in registers + + instrList = [x.asm for x in config.independentInstrs] + for ic in sorted(set([1, len(instrList)])): + if len(instrList) > 1: htmlReports.append('

    With ' + str(ic) + ' independent instruction' + ('s' if ic>1 else '') + '

    \n') + if ic > 1: htmlReports.append('
    \n') + for useDepBreakingInstrs in [False, True]: + if useDepBreakingInstrs: + if not config.depBreakingInstrs: continue + instrStr = ";".join([i+';'+config.depBreakingInstrs for i in instrList[0:ic]]) + htmlReports.append('

    With additional dependency-breaking instructions

    \n') + else: + instrStr = ";".join(instrList[0:ic]) + + createIacaAsmFile("/tmp/ramdisk/asm.s", "", 0, instrStr) + try: + subprocess.check_output(['as', '/tmp/ramdisk/asm.s', '-o', '/tmp/ramdisk/asm.o']) + iaca_out = subprocess.check_output(iacaCMDLine + ['/tmp/ramdisk/asm.o'], stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + logging.warn('Error: ' + e.output) + if minTP != sys.maxint: + htmlReports.append('
    ' + e.output + '
    \n') + continue # on SNB, IACA 2.2 crashes on only some (larger) inputs + else: + return None + + if not iaca_out or ' ! ' in iaca_out or ' X ' in iaca_out or ' 0X ' in iaca_out or not 'Total Num Of Uops' in iaca_out: + print "IACA error" + return None + + print instrNode.attrib['iform'] + ' - throughput' + + htmlReports.append('
    ' + iaca_out + '
    \n') + + cycles = float(iaca_out.split('\n')[3].split()[2]) + cycles = cycles/ic + minTP = min(minTP, cycles) + if not useDepBreakingInstrs: minTP_noDepBreaking_noLoop = min(minTP_noDepBreaking_noLoop, cycles) + + if ic == 1 and not useDepBreakingInstrs: + minTP_single = min(minTP_single, cycles) + + unfused_uops_line = iaca_out.split('\n')[-2] + unfused_uops = int(unfused_uops_line.split()[4])/ic + + ports_line = iaca_out.split('\n')[-3] + fused_uops = '^' in ports_line.split()[1] + + num_ports = re.search('\| Port \|.*', iaca_out).group(0).count('|')-2 + + ports_dict = {} + for p in range(0, num_ports): + portCol = ports_line.split('|')[p+2].split() + if portCol: + ports_dict[p] = float(portCol[0]) + else: + ports_dict[p] = 0.0 + + port0 = ports_line.split('|')[2].split() + if len(port0)>1: + divCycles = int(float(port0[1])) + else: + divCycles = 0 + + return TPResult(minTP, minTP_noDepBreaking_noLoop, minTP_single, unfused_uops, fused_uops, divCycles, 0, False, config, ports_dict) + else: + hasMemWriteOperand = len(instrNode.findall('./operand[@type="mem"][@r="1"][@w="1"]'))>0 + uops = 0 + uopsFused = 0 + divCycles = 0 + ILD_stalls = 0 + dec0 = False + ports_dict = {} + for config in configs: + if config.note: htmlReports.append('

    ' + config.note + '

    \n') + + instrIList = config.independentInstrs + for ic in sorted(set([1, min(4, len(instrIList)), min(8, len(instrIList)), len(instrIList)])): + if len(instrIList) > 1: htmlReports.append('

    With ' + str(ic) + ' independent instruction' + ('s' if ic>1 else '') + '

    \n') + htmlReports.append('
    ') + + for useDepBreakingInstrs in ([False, True] if config.depBreakingInstrs else [False]): + if useDepBreakingInstrs: + instrStr = ';'.join([config.depBreakingInstrs+';'+config.preInstrCode+';'+i.asm for i in instrIList[0:ic]]) + htmlReports.append('

    With additional dependency-breaking instructions

    \n') + else: + instrStr = ';'.join([config.preInstrCode+';'+i.asm for i in instrIList[0:ic]]) + + init = list(chain.from_iterable(i.regMemInit for i in instrIList[0:ic])) + config.init + + for repType in ['unrollOnly', 'loopSmall', 'loopBig']: + if minTP < sys.maxint and minTP > 100: continue + + if repType == 'unrollOnly': + unrollCount = int(round(500/ic+49, -2)) # should still fit in the icache + if instrNode.attrib['iclass'] in ['WBINVD']: unrollCount /= 10; + loopCount = 0 + else: + # we test with a small loop body so that uops may be delivered from the loop stream detector (LSD) + # we also test with a larger loop body to minimize potential overhead from the loop itself + loopCount = 100; + unrollCount = max(1, int(round(10.0/ic))) + if repType == 'loopBig': + unrollCount *= 10 + + if loopCount > 0: + htmlReports.append('

    With loop_count=' + str(loopCount) + ' and unroll_count=' + str(unrollCount) + '

    \n') + else: + htmlReports.append('

    With unroll_count=' + str(unrollCount) +' and no inner loop

    \n') + + htmlReports.append('\n') + + cycles = fancyRound(result['Core cycles']/ic) + + invalid = False + if any('PORT' in e for e in result): + maxPortUops = max(v/(len(e)-9) for e,v in result.items() if e.startswith('UOPS_PORT')) + if maxPortUops * .98 > result['Core cycles']: + print 'More uops on ports than cycles, uops: {}, cycles: {}'.format(maxPortUops, result['Core cycles']) + invalid = True + + if not invalid: + minTP = min(minTP, cycles) + if not useDepBreakingInstrs and repType == 'unrollOnly': minTP_noDepBreaking_noLoop = min(minTP_noDepBreaking_noLoop, cycles) + + if ic == 1 and (minTP == sys.maxint or cycles == minTP) and not useDepBreakingInstrs and repType == 'unrollOnly': + minTP_single = min(minTP_single, cycles) + + if isIntelCPU(): + ports_dict = {int(p[9:]): i for p, i in result.items() if 'UOPS_PORT' in p} + elif isAMDCPU() and not instrNode.attrib['extension'] == 'BASE': + # We ignore BASE instructions, as they sometimes wrongly count floating point uops + ports_dict = {int(p[23:]): i for p, i in result.items() if 'FpuPipeAssignment.Total' in p} + + uops = int(result['UOPS']+.1) + if 'RETIRE_SLOTS' in result: + uopsFused = int(result['RETIRE_SLOTS']+.1) + + if 'ILD_STALL.LCP' in result: + ILD_stalls = int(result['ILD_STALL.LCP']) + + if 'INST_DECODED.DEC0' in result: + dec0 = (int(round(result['INST_DECODED.DEC0'])) > 0) + + if 'DIV_CYCLES' in result: + divCycles = int(result['DIV_CYCLES']+.1) + + minConfig = config + + htmlReports.append('
    # Latencies (in core cycles) of the basic instructions that are used as building blocks of dependency chains.
    # The dict is filled by getBasicLatencies().
    basicLatency = {}


    def _measureCoreCycles(instrNode, asm, **kwargs):
        # Runs one experiment and returns its 'Core cycles' value, rounded to the nearest integer.
        return int(round(runExperiment(instrNode, asm, **kwargs)['Core cycles']))


    def getBasicLatencies(instrNodeList):
        """Measure the latencies of the chain-building instructions and store them in basicLatency.

        instrNodeList is only used to decide whether the VEX (an instruction with iclass VANDPS exists) and the
        AVX-512 (an instruction with extension AVX512EVEX exists) measurements are performed.

        The process is terminated with exit status 1 if one of the sanity checks fails (MOVSX latency != 1,
        TEST+SETcc latency != 2, masked VMOVUPS with more than one uop). The original code called sys.exit()
        without an argument here, i.e., it reported success to the calling shell after printing the error.
        """
        movsxCycles = _measureCoreCycles(instrNodeDict['MOVSXD (R64, R32)'], 'MOVSX RAX, EAX')
        if movsxCycles != 1:
            print('Latency of MOVSX must be 1')
            sys.exit(1)
        basicLatency['MOVSX'] = movsxCycles

        for key, nodeName, asm in [('AND', 'AND_21 (R64, R64)', 'AND RAX, RBX'),
                                   ('OR', 'OR_09 (R64, R64)', 'OR RAX, RBX'),
                                   ('XOR', 'XOR_31 (R64, R64)', 'XOR RAX, RBX'),
                                   ('CMC', 'CMC', 'CMC'),
                                   ('MOVQ', 'MOVQ_0F6F (MM, MM)', 'MOVQ MM0, MM0')]:
            basicLatency[key] = _measureCoreCycles(instrNodeDict[nodeName], asm)

        for flag in STATUSFLAGS_noAF:
            cc = flag[0]
            # TEST -> SETcc -> TEST ... forms a chain of two instructions; both must have a latency of 1
            if _measureCoreCycles(None, 'TEST AL, AL; SET' + cc + ' AL') != 2:
                print('Latencies of TEST and SET' + cc + ' must be 1')
                sys.exit(1)
            basicLatency['SET' + cc] = 1
            basicLatency['TEST'] = 1

            # the latency of TEST (1 cycle) is subtracted from the latency of the TEST/CMOVcc chain
            basicLatency['CMOV' + cc] = _measureCoreCycles(None, 'TEST RAX, RAX; CMOV' + cc + ' RAX, RAX') - 1

        for instr in ['ANDPS', 'ANDPD', 'ORPS', 'ORPD', 'PAND', 'POR']:
            basicLatency[instr] = _measureCoreCycles(instrNodeDict[instr + ' (XMM, XMM)'], instr + ' XMM1, XMM1')

        for instr in ['PSHUFD', 'SHUFPD']:
            basicLatency[instr] = _measureCoreCycles(instrNodeDict[instr + ' (XMM, XMM, I8)'], instr + ' XMM1, XMM1, 0')

        # any() instead of the truth value of filter(): does not build a list, and also works if filter() is lazy
        if any(x.findall('[@iclass="VANDPS"]') for x in instrNodeList):
            for instr in ['VANDPS', 'VANDPD', 'VORPS', 'VORPD', 'VPAND', 'VPOR']:
                basicLatency[instr] = _measureCoreCycles(instrNodeDict[instr + ' (XMM, XMM, XMM)'], instr + ' XMM1, XMM1, XMM1')

            for instr in ['VSHUFPD']:
                basicLatency[instr] = _measureCoreCycles(instrNodeDict[instr + ' (XMM, XMM, XMM, I8)'],
                                                         instr + ' XMM1, XMM1, XMM1, 0')

            for instr in ['VPSHUFD']:
                basicLatency[instr] = _measureCoreCycles(instrNodeDict[instr + ' (XMM, XMM, I8)'], instr + ' XMM1, XMM1, 0')

        if any(x.findall('[@extension="AVX512EVEX"]') for x in instrNodeList):
            basicLatency['KMOVQ'] = _measureCoreCycles(instrNodeDict['KMOVQ (K, K)'], 'KMOVQ K1, K1')

            for regType in ['XMM', 'YMM', 'ZMM']:
                maskedMove = 'VMOVUPS ' + regType + '1 {k1}, ' + regType + '1'

                vmovups_result = runExperiment(instrNodeDict['VMOVUPS ({0}, K, {0})'.format(regType)], maskedMove)
                vmovups_cycles = int(round(vmovups_result['Core cycles']))
                vmovups_uops = int(round(vmovups_result['UOPS']))
                basicLatency['VMOVUPS_' + regType + '_' + 'K'] = vmovups_cycles

                if vmovups_uops != 1:
                    print('VMOVUPS must have exactly 1 uop')
                    sys.exit(1)

                # the masked VMOVUPS closes the chain vector reg -> mask reg -> vector reg; its latency is subtracted
                vpmovq2m_cycles = _measureCoreCycles(instrNodeDict['VPMOVQ2M (K, ' + regType + ')'],
                                                     'VPMOVQ2M K1, ' + regType + '1; ' + maskedMove)
                basicLatency['VPMOVQ2M_'+regType] = vpmovq2m_cycles - vmovups_cycles

                vptestnmq_cycles = _measureCoreCycles(instrNodeDict['VPTESTNMQ (K, K, {0}, {0})'.format(regType)],
                                                      'VPTESTNMQ K1 {K1}, ' + regType + '1, ' + regType + '1; ' + maskedMove)
                basicLatency['VPTESTNMQ_'+regType] = vptestnmq_cycles - vmovups_cycles

        for memWidth in [8, 16, 32, 64]:
            reg = regToSize('R12', memWidth)
            code = 'mov ' + reg + ', [r14];' + ';'.join(10*['MOVSX R12, R12w']) + '; mov [r14], ' + reg
            basicLatency['MOV_10MOVSX_MOV_'+str(memWidth)] = _measureCoreCycles(None, code, unrollCount=100)

        print('Basic Latencies: ' + str(basicLatency))
# Returns a dict {opNode: instr}, s.t. opNode is an operand of instrNode that is both read and written (or, for flags,
# written and either read or conditionally written), and instr is an assembler instruction that breaks the dependency
# on this operand. ignoreOperand, if specified, is never added to the dict.
def getDependencyBreakingInstrs(instrNode, opRegDict, ignoreOperand = None):
    """Return a dict that maps operand nodes of instrNode to dependency-breaking instructions.

    instrNode:     XML node of the instruction
    opRegDict:     dict {operand idx (int): register name} with the registers that were chosen for the operands
    ignoreOperand: operand node for which no dependency-breaking instruction should be generated

    Register operands that are neither contained in opRegDict nor suppressed are skipped, as the register that
    they use is not known here. (Previously, 'reg' was unbound in this case, or still contained the register of
    the previous operand.)
    """
    depBreakingInstrs = dict()

    for opNode in instrNode.findall('./operand[@type="reg"][@r="1"][@w="1"]'):
        if opNode == ignoreOperand: continue

        opI = int(opNode.attrib['idx'])
        if opI in opRegDict:
            reg = opRegDict[opI]
        elif opNode.attrib.get('suppressed', '0') == '1':
            reg = opNode.text
        else:
            # no register is known for this operand
            continue

        xtype = opNode.attrib.get('xtype', '')
        regPrefix = re.sub(r'\d', '', reg)
        if reg in GPRRegs:
            if reg not in globalDoNotWriteRegs:
                depBreakingInstrs[opNode] = 'MOV ' + reg + ', 0' # don't use XOR as this would also break flag dependencies
            elif reg in ['RSP', 'RBP']:
                depBreakingInstrs[opNode] = 'MOV ' + reg + ', R14'
        elif xtype.startswith('f'):
            # floating-point data: copy from register 15 of the same register class
            movInstr = 'VMOVUPD ' if isAVXInstr(instrNode) else 'MOVUPD '
            depBreakingInstrs[opNode] = movInstr + reg + ', ' + regPrefix + '15'
        elif regPrefix in ['XMM', 'YMM', 'ZMM'] and isAVXInstr(instrNode):
            depBreakingInstrs[opNode] = 'VXORPS ' + reg + ', ' + reg + ', ' + reg
        elif 'MM' in regPrefix:
            depBreakingInstrs[opNode] = 'PXOR ' + reg + ', ' + reg

    for opNode in instrNode.findall('./operand[@type="mem"][@r="1"][@w="1"]'):
        if opNode == ignoreOperand: continue

        memWidth = int(opNode.attrib['width'])
        if memWidth <= 64:
            depBreakingInstrs[opNode] = 'MOV ' + opNode.attrib['memory-prefix'] + ' [' + getAddrReg(instrNode, opNode) + '], 0'
        else:
            depBreakingInstrs[opNode] = 'MOVUPS [' + getAddrReg(instrNode, opNode) + '], XMM15'

    for opNode in instrNode.findall('./operand[@type="flags"][@w="1"]'):
        if opNode == ignoreOperand: continue
        # only flags that are also read, or that are written conditionally, need a dependency-breaking instruction
        if not (opNode.attrib.get('r', '') == '1' or opNode.attrib.get('conditionalWrite', '') == '1'): continue

        if not any(('flag_'+f in opNode.attrib) for f in STATUSFLAGS_noAF): continue
        depBreakingInstrs[opNode] = 'TEST R13, R13'

    return depBreakingInstrs
# Returns an assembler code string (instructions separated by ';') with dependency-breaking instructions for the
# suppressed operands of instrNode, and for its explicit operands that can only be one specific register.
def getDependencyBreakingInstrsForSuppressedOperands(instrNode):
    iclass = instrNode.attrib['iclass']
    if iclass in ['LEAVE']:
        return ''
    if iclass in ['XLAT']:
        return 'XOR RAX, RAX'

    def hasSeveralCandidates(node):
        # explicit (i.e., not suppressed) operand for which more than one register can be used
        return node.attrib.get('suppressed', '0') == '0' and ',' in node.text

    instrs = []
    usesXor = False
    for readNode in instrNode.findall('./operand[@type="reg"][@r="1"]'):
        if hasSeveralCandidates(readNode):
            continue

        reg = readNode.text
        if reg not in GPRRegs or reg in globalDoNotWriteRegs|specialRegs:
            continue

        # a dependency has to be broken only if the instruction also writes (a part of) the same register
        writeNodes = instrNode.findall('./operand[@type="reg"][@w="1"]')
        if any(regTo64(reg) == regTo64(w.text) for w in writeNodes if not hasSeveralCandidates(w)):
            # we use the corresponding 64-bit register, as dependency breaking doesn't seem to work for reg sizes <= 16
            instrs.append('XOR ' + regTo64(reg) + ', ' + regTo64(reg))
            usesXor = True

    instrs.extend('MOV qword ptr [' + memNode.attrib['base'] + '], 0'
                  for memNode in instrNode.findall('./operand[@type="mem"][@r="1"][@w="1"][@suppressed="1"]'))

    if not usesXor:
        for flagsNode in instrNode.findall('./operand[@type="flags"][@w="1"]'):
            # on some CPUs, instructions that write flags conditionally also read the flags
            dependsOnFlags = flagsNode.attrib.get('r', '') == '1' or flagsNode.attrib.get('conditionalWrite', '') == '1'
            if dependsOnFlags and any(('flag_'+f in flagsNode.attrib) for f in STATUSFLAGS_noAF):
                instrs.append('TEST R13, R13')

    return ';'.join(instrs)
# The operand values (constants) used below are from Agner Fog's scripts.
def getDivLatConfigLists(instrNode, opNode1, opNode2, cRep):
    """Build the latency configurations for division, square root, and reciprocal instructions.

    For each set of operand values (one set that is labeled/treated as the slow case, and one for the fast case),
    a separate LatConfigList is created.

    instrNode: XML node of the instruction
    opNode1:   operand node at which the dependency chain enters the instruction
    opNode2:   operand node whose value is fed back to opNode1 by the chain instructions
    cRep:      number of repetitions of the filler instruction in the chain

    Returns a list of LatConfigList objects. If the iclass of instrNode is not handled by any of the three
    branches, the function falls off the end, i.e., it returns None.
    """
    if instrNode.attrib['iclass'] in ['DIV', 'IDIV']:
        # the divisor is the operand that is not suppressed (if there are several, the last one wins)
        for op in instrNode.iter('operand'):
            if op.attrib.get('suppressed', '0') == '0':
                divisorNode = op
        memDivisor = divisorNode.attrib['type'] == 'mem'
        width = int(divisorNode.attrib['width'])

        if memDivisor:
            instrI = getInstrInstanceFromNode(instrNode)
        else:
            divisorReg = regToSize('RBX', width)
            instrI = getInstrInstanceFromNode(instrNode, opRegDict={int(divisorNode.attrib['idx']):divisorReg})

        # NOTE(review): for any other width, maxRAX/maxRDX/maxDivisor remain unbound
        if width == 8:
            maxRAX = '13057'
            maxRDX = '0'
            maxDivisor = '123'
        elif width == 16:
            maxRAX = '133'
            maxRDX = '0x343a'
            maxDivisor = '0x75e6'
        elif width == 32:
            maxRAX = '133'
            maxRDX = '0x343a9ed7'
            maxDivisor = '0x75e6e44f'
        elif width == 64:
            maxRAX = '133'
            maxRDX = '0x343a9ed744556677'
            maxDivisor = '0x75e6e44fccddeeff'

        minRAX = '0'
        minRDX = '0'
        minDivisor = '1'

        configLists = []
        for RAX, RDX, divisor in [(minRAX, minRDX, minDivisor), (maxRAX, maxRDX, maxDivisor)]:
            configList = LatConfigList()
            configLists.append(configList)

            config = LatConfig(instrI)

            if RAX == maxRAX:
                config.notes.append('slow division')
            else:
                config.notes.append('fast division')

            # registers that hold the operand values, so that they can be restored in each iteration
            immReg = {'RAX': 'R8', 'RDX': 'R9', 'divisor': 'R10'}
            config.init = ['MOV ' + immReg['RAX'] + ', ' + RAX,
                           'MOV ' + immReg['RDX'] + ', ' + RDX,
                           'MOV ' + immReg['divisor'] + ', ' + divisor]

            if memDivisor:
                config.init += ['MOV [R14], ' + immReg['divisor']]
            else:
                config.init += ['MOV RBX, ' + immReg['divisor']]
            config.init += ['MOV RAX, ' + immReg['RAX'],
                            'MOV RDX, ' + immReg['RDX']]

            chainInstrs = ''
            chainLatency = 0
            immInstr = ''
            for opNode in instrNode.iter('operand'):
                if opNode.attrib['type'] == 'flags': continue
                if opNode == opNode1:
                    if opNode == divisorNode:
                        if memDivisor:
                            # the address register R14 is made dependent on opNode2; XORing R12 an even number of
                            # times leaves R14 unchanged
                            chainInstrs = 'MOVSX R12, ' + regToSize(opNode2.text, min(getRegSize(opNode2.text), 32)) + '; '
                            chainInstrs += ('XOR R14, R12; ') * cRep # cRep is a power of two
                            chainLatency = basicLatency['MOVSX'] + basicLatency['XOR'] * cRep
                        else:
                            # after the final OR with the saved divisor value, the divisor register contains the
                            # original value again, but it now depends on opNode2
                            chainInstrs = 'AND {0}, {1}; AND {0}, {2}; OR {0}, {2}; '.format(divisorReg, regToSize(opNode2.text, getRegSize(divisorReg)),
                                                                                           regToSize(immReg['divisor'], getRegSize(divisorReg)))
                            chainInstrs += 'OR {0}, {0}; '.format(divisorReg) * cRep
                            chainLatency = basicLatency['AND'] * 2 + basicLatency['OR'] * (cRep+1)
                    else:
                        chainInstrs = 'AND {0}, {1}; OR {0}, {1}; '.format(opNode.text, regToSize(immReg[regTo64(opNode.text)], getRegSize(opNode.text)))
                        chainInstrs += 'OR {0}, {0}; '.format(opNode.text) * cRep
                        chainLatency = basicLatency['AND'] + basicLatency['OR'] * (cRep+1)

                        if opNode != opNode2:
                            # an additional AND creates the dependency on opNode2
                            chainInstrs = 'AND ' + opNode.text + ', ' + opNode2.text + '; ' + chainInstrs
                            chainLatency += basicLatency['AND']

                elif opNode != divisorNode:
                    # operands that are not part of the chain are restored from the saved values
                    immInstr += 'MOV ' + opNode.text + ', ' + regToSize(immReg[regTo64(opNode.text)], getRegSize(opNode.text)) + ';'

            config.chainInstrs = chainInstrs + '; ' + immInstr
            config.chainLatency = chainLatency
            configList.append(config)
        return configLists
    elif instrNode.attrib['iclass'] in ['DIVSS', 'DIVPS', 'DIVSD', 'DIVPD', 'VDIVSS', 'VDIVPS', 'VDIVSD', 'VDIVPD']:
        memDivisor = len(instrNode.findall('./operand[@type="mem"]'))>0
        # last character of the iclass: 'S' (...SS, ...PS) or 'D' (...SD, ...PD)
        dataType = instrNode.attrib['iclass'][-1]

        if dataType == 'S':
            maxDividend = '0x54ed392654ed3926' # 8.15093E12 in high and low 32-bit
            maxDivisor = '0x3f99f4c03f99f4c0' # 1.20278 in high and low 32-bit
            minDividend = '0x3f8000003f800000' # 1.0 in high and low 32-bit
            minDivisor = '0x3f8000003f800000' # 1.0 in high and low 32-bit
        else:
            maxDividend = '0x429da724b687da66' # 8.1509281715106E12
            maxDivisor = '0x3ff33e97f934078b' # 1.20278165192619
            minDividend = '0x3ff0000000000000' # 1.0
            minDivisor = '0x3ff0000000000000' # 1.0

        configLists = []
        for dividend, divisor in [(maxDividend, maxDivisor), (minDividend, minDivisor)]:
            configList = LatConfigList()
            configLists.append(configList)

            regType = 'XMM'
            if 'YMM' in instrNode.attrib['iform']: regType = 'YMM'
            if 'ZMM' in instrNode.attrib['iform']: regType = 'ZMM'

            # the divisor is stored at [R14], the dividend at [R14+64]
            init = ['MOV RAX, ' + dividend]
            init += ['MOV RBX, ' + divisor]
            for i in range(0, getRegSize(regType)/8, 8): init += ['MOV [R14+' + str(i) + '], RBX']
            for i in range(64, 64+getRegSize(regType)/8, 8): init += ['MOV [R14+' + str(i) + '], RAX']

            if instrNode.attrib['iclass'] in ['DIVSS', 'DIVPS', 'DIVSD', 'DIVPD']:
                # XMM1/XMM2 keep copies of the dividend/divisor; XMM3/XMM4 are the operands of the instruction
                init += ['MOVUP' + dataType + ' XMM1, [R14+64]']
                init += ['MOVUP' + dataType + ' XMM2, [R14]']
                init += ['MOVUP' + dataType + ' XMM3, [R14+64]']
                init += ['MOVUP' + dataType + ' XMM4, [R14]']

                instrI = getInstrInstanceFromNode(instrNode, opRegDict={1:'XMM3', 2:'XMM4'})

                if opNode1 == opNode2:
                    if dividend == minDividend:
                        # some CPUs seem to have some bypass delay when using (V)ORP*; we additionally test them with the same reg for both operands
                        instrISameReg = getInstrInstanceFromNode(instrNode, opRegDict={1:'XMM3', 2:'XMM3'})
                        config = LatConfig(instrISameReg, init=init)
                        configList.append(config)

                    config = LatConfig(instrI, init=init)
                    if dividend == maxDividend:
                        config.chainInstrs = 'ORP{0} XMM3, XMM1; ANDP{0} XMM3, XMM1; '.format(dataType)
                        config.chainLatency = basicLatency['ORP' + dataType] + basicLatency['ANDP' + dataType]
                    config.chainInstrs += 'ORP{} XMM3, XMM3;'.format(dataType) * cRep
                    config.chainLatency += basicLatency['ORP' + dataType] * cRep
                    configList.append(config)
                    configList.isUpperBound = True
                else:
                    if memDivisor:
                        configList.isUpperBound = True
                        # find all other instrs from XMM3 to R12
                        for chainInstrI in getAllChainInstrsFromRegToReg(instrNode, 'XMM3', 'R12'):
                            if dividend == maxDividend:
                                chainInstrs = chainInstrI.asm + '; MOVUP' + dataType + ' XMM3, XMM1; '
                            else:
                                chainInstrs = chainInstrI.asm + '; '
                            chainInstrs += ('XOR R14, R12; ') * cRep
                            chainLatency = 1 + basicLatency['XOR'] * cRep
                            configList.append(LatConfig(instrI, init=init, chainInstrs=chainInstrs, chainLatency=chainLatency))
                    else:
                        if dividend == minDividend:
                            # some CPUs seem to have some bypass delay when using (V)ORP*; we additionally test them with the same reg for both operands
                            instrISameReg = getInstrInstanceFromNode(instrNode, opRegDict={1:'XMM3', 2:'XMM3'})
                            config = LatConfig(instrISameReg, init=init)
                            configList.append(config)

                        config = LatConfig(instrI, init=init)
                        if dividend == maxDividend:
                            config.chainInstrs = 'ANDP{0} XMM4, XMM3; MOVUP{0} XMM3, XMM1; ANDP{0} XMM4, XMM2; ORP{0} XMM4, XMM2; '.format(dataType)
                        else:
                            config.chainInstrs = 'ANDP{0} XMM4, XMM3; ANDP{0} XMM4, XMM2; ORP{0} XMM4, XMM2; '.format(dataType)
                        config.chainInstrs += 'ORP{} XMM4, XMM4; '.format(dataType) * cRep
                        config.chainLatency = basicLatency['ANDP' + dataType] * 2 + basicLatency['ORP' + dataType] * (cRep+1)
                        configList.append(config)
                        configList.isUpperBound = True
            else: # instrNode.attrib['iclass'] in ['VDIVSS', 'VDIVPS', 'VDIVSD', 'VDIVPD']:
                nOperands = len(instrNode.findall('./operand'))

                # the "base" registers keep copies of the operand values
                targetReg = regType + '0'
                dividendBaseReg = regType + '1'
                dividendReg = regType + '2'
                divisorBaseReg = regType + '3'
                divisorReg = regType + '4'

                init += ['VMOVUP' + dataType + ' ' + dividendBaseReg + ', [R14+64]']
                init += ['VMOVUP' + dataType + ' ' + dividendReg + ', [R14+64]']
                init += ['VMOVUP' + dataType + ' ' + divisorBaseReg + ', [R14]']
                init += ['VMOVUP' + dataType + ' ' + divisorReg + ', [R14]']

                # the last two operands are the dividend and the divisor
                instrI = getInstrInstanceFromNode(instrNode, opRegDict={1:targetReg, (nOperands-1):dividendReg, nOperands:divisorReg})

                if int(opNode1.attrib['idx']) == nOperands - 1: #dividend
                    if dividend == minDividend:
                        # some CPUs seem to have some bypass delay when using (V)ORP*; we additionally test them with the same reg for both operands
                        instrISameReg = getInstrInstanceFromNode(instrNode, opRegDict={1:dividendReg, (nOperands-1):dividendReg, nOperands:divisorReg})
                        config = LatConfig(instrISameReg, init=init)
                        configList.append(config)

                    config = LatConfig(instrI, init=init)
                    config.chainInstrs = 'VORP{0} {1}, {2}, {2}; VORP{0} {1}, {1}, {3}; VANDP{0} {1}, {1}, {3}; '.format(dataType, dividendReg, targetReg, dividendBaseReg)
                    config.chainInstrs += 'VORP{0} {1}, {1}, {1}; '.format(dataType, dividendReg) * cRep
                    config.chainLatency = basicLatency['VORP' + dataType] * (cRep+2) + basicLatency['VANDP' + dataType]
                    configList.append(config)
                    configList.isUpperBound = True
                else: # divisor
                    if memDivisor:
                        configList.isUpperBound = True
                        # find all other instrs from targetReg to R12
                        for chainInstrI in getAllChainInstrsFromRegToReg(instrNode, targetReg, 'R12'):
                            chainInstrs = chainInstrI.asm + '; ' + ('XOR R14, R12; ') * cRep
                            chainLatency = 1 + basicLatency['XOR'] * cRep
                            configList.append(LatConfig(instrI, init=init, chainInstrs=chainInstrs, chainLatency=chainLatency))
                    else:
                        if divisor == minDivisor:
                            # some CPUs seem to have some bypass delay when using (V)ORP*; we additionally test them with the same reg for both operands
                            instrISameReg = getInstrInstanceFromNode(instrNode, opRegDict={1:divisorReg, (nOperands-1):dividendReg, nOperands:divisorReg})
                            config = LatConfig(instrISameReg, init=init)
                            configList.append(config)

                        config = LatConfig(instrI, init=init)
                        config.chainInstrs = 'VORP{0} {1}, {2}, {2}; VORP{0} {1}, {1}, {3}; VANDP{0} {1}, {1}, {3}; '.format(dataType, divisorReg, targetReg, divisorBaseReg)
                        config.chainInstrs += 'VORP{0} {1}, {1}, {1}; '.format(dataType, divisorReg) * cRep
                        config.chainLatency = basicLatency['VORP' + dataType] * (cRep+2) + basicLatency['VANDP' + dataType]
                        configList.append(config)
                        configList.isUpperBound = True
        return configLists
    elif instrNode.attrib['iclass'] in ['SQRTSS', 'SQRTPS', 'SQRTSD', 'SQRTPD', 'RSQRTSS', 'RSQRTPS', 'RCPSS', 'RCPPS', 'VSQRTSS', 'VSQRTPS', 'VSQRTSD',
                                        'VSQRTPD','VRSQRTSS', 'VRSQRTPS', 'VRSQRT14PD', 'VRSQRT14PS', 'VRSQRT14SD', 'VRSQRT14SS', 'VRCPSS', 'VRCPPS']:
        # last character of the iclass: 'S' or 'D'
        dataType = instrNode.attrib['iclass'][-1]

        if dataType == 'S':
            maxArg = '0x72d30ff172d30ff1' # 8.36104E30 in high and low 32-bit
            minArg = '0x3f8000003f800000' # 1.0 in high and low 32-bit
        else:
            maxArg = '0x465a61fe1acdc21c' # 8.3610378602352937E30
            minArg = '0x3ff0000000000000' # 1.0

        instrPrefix = ''
        if instrNode.attrib['iclass'].startswith('V'): instrPrefix = 'V'

        configLists = []

        for arg in [maxArg, minArg]:
            configList = LatConfigList()
            configLists.append(configList)

            regType = 'XMM'
            if 'YMM' in instrNode.attrib['iform']: regType = 'YMM'
            if 'ZMM' in instrNode.attrib['iform']: regType = 'ZMM'

            # the argument is stored at [R14]
            init = ['MOV RAX, ' + arg]
            for i in range(0, getRegSize(regType)/8, 8): init += ['MOV [R14+' + str(i) + '], RAX']

            # sourceBaseReg keeps a copy of the argument
            targetReg = regType + '0'
            sourceBaseReg = regType + '1'
            sourceReg = regType + '2'

            init += [instrPrefix + 'MOVUP' + dataType + ' ' + sourceReg + ', [R14]']
            init += [instrPrefix + 'MOVUP' + dataType + ' ' + sourceBaseReg + ', [R14]']

            instrI = getInstrInstanceFromNode(instrNode, opRegDict={int(opNode2.attrib['idx']):targetReg, int(opNode1.attrib['idx']): sourceReg})

            if opNode1.attrib['type'] == 'mem':
                configList.isUpperBound = True
                # find all other instrs from targetReg to R12
                for chainInstrI in getAllChainInstrsFromRegToReg(instrNode, targetReg, 'R12'):
                    chainInstrs = chainInstrI.asm + '; ' + ('XOR R14, R12; ') * cRep
                    chainLatency = 1 + basicLatency['XOR'] * cRep
                    configList.append(LatConfig(instrI, init=init, chainInstrs=chainInstrs, chainLatency=chainLatency))
            else:
                if arg == minArg:
                    # some CPUs seem to have some bypass delay when using (V)ORP*; we additionally test them with the same reg for both operands
                    instrISameReg = getInstrInstanceFromNode(instrNode, opRegDict={int(opNode2.attrib['idx']):sourceReg, int(opNode1.attrib['idx']): sourceReg})
                    config = LatConfig(instrISameReg, init=init)
                    configList.append(config)

                config = LatConfig(instrI, init=init)
                if instrPrefix == 'V':
                    config.chainInstrs = 'VORP{0} {1}, {2}, {2}; VORP{0} {1}, {1}, {3}; VANDP{0} {1}, {1}, {3}; '.format(dataType, sourceReg, targetReg, sourceBaseReg)
                    config.chainInstrs += 'VORP{0} {1}, {1}, {1}; '.format(dataType, sourceReg) * cRep
                    config.chainLatency = basicLatency['VORP' + dataType] * (cRep+2) + basicLatency['VANDP' + dataType]
                else:
                    config.chainInstrs = 'ORP{0} {1}, {2}; ORP{0} {1}, {3}; ANDP{0} {1}, {3}; '.format(dataType, sourceReg, targetReg, sourceBaseReg)
                    config.chainInstrs += 'ORP{0} {1}, {1}; '.format(dataType, sourceReg) * cRep
                    config.chainLatency = basicLatency['ORP' + dataType] * (cRep+2) + basicLatency['ANDP' + dataType]
                configList.append(config)
                configList.isUpperBound = True
        return configLists
# Collects instances of all instructions (with the same vex/evex encoding as instrNode) that read startReg and write
# targetReg, including instructions that read or write only a part of the respective register.
def getAllChainInstrsFromRegToReg(instrNode, startReg, targetReg):
    allFPDataTypes = ['PD', 'PS', 'SD', 'SS']
    suffix = instrNode.attrib['iclass'][-2:]
    dataType = suffix if suffix in allFPDataTypes else ''

    def isCompatible(r, wantedReg):
        # r is (a part of) the same general-purpose register as wantedReg, or a register of another kind with the
        # same name (ignoring the first character) that is not larger than wantedReg
        return ((r in GPRRegs and wantedReg in GPRRegs and regTo64(wantedReg)==regTo64(r)) or
                ((r not in GPRRegs) and wantedReg[1:] == r[1:] and getRegSize(r) <= getRegSize(wantedReg)))

    chainInstrs = []
    for candidate in instrNodeList:
        sameVex = instrNode.attrib.get('vex', '0') == candidate.attrib.get('vex', '0')
        if not sameVex or instrNode.attrib.get('evex', '0') != candidate.attrib.get('evex', '0'):
            continue

        iclass = candidate.attrib['iclass']
        # if both instructions work on floating-point data, the data types have to agree
        if dataType and any((d in iclass) for d in allFPDataTypes) and dataType not in iclass:
            continue

        for readNode in candidate.findall('./operand[@type="reg"][@r="1"]'):
            readRegs = [r for r in readNode.text.split(',') if isCompatible(r, startReg)]
            if not readRegs:
                continue
            reg1 = readRegs[0]

            for writeNode in candidate.findall('./operand[@type="reg"][@w="1"]'):
                writeRegs = [r for r in writeNode.text.split(',') if r!=reg1 and isCompatible(r, targetReg)]
                if not writeRegs:
                    continue
                reg2 = writeRegs[0]

                opRegDict = {int(readNode.attrib['idx']):reg1, int(writeNode.attrib['idx']):reg2}
                chainInstrs.append(getInstrInstanceFromNode(candidate, [reg1, reg2], [reg1, reg2], True, opRegDict))
    return chainInstrs
# Returns a list of LatConfigs for instrI whose chain instructions load targetReg from the memory location that
# addrReg points to.
def getLatConfigsFromMemToReg(instrNode, instrI, memOpNode, targetReg, addrReg, cRep):
    configs = []

    if targetReg.startswith('MM'):
        # MMX register
        configs.append(LatConfig(instrI, chainInstrs='MOVQ ' + targetReg + ', [' + addrReg + '];', chainLatency=1))
        return configs

    if 'MM' not in targetReg:
        # other registers: use all instructions (with the same vex/evex encoding) that read a memory operand of the
        # same width and write targetReg
        for candidate in instrNodeList:
            sameVex = instrNode.attrib.get('vex', '0') == candidate.attrib.get('vex', '0')
            if not sameVex or instrNode.attrib.get('evex', '0') != candidate.attrib.get('evex', '0'):
                continue

            for memNode in candidate.findall('./operand[@type="mem"][@r="1"]'):
                if memNode.attrib.get('suppressed', '0') == '1': continue
                if memOpNode.attrib['width'] != memNode.attrib['width']: continue
                if memOpNode.attrib.get('VSIB', '') != memNode.attrib.get('VSIB', ''): continue

                writeNodes = [w for w in candidate.findall('./operand[@type="reg"][@w="1"]') if targetReg in w.text.split(',')]
                for writeNode in writeNodes:
                    if writeNode.attrib.get('optional', '') == '1': continue
                    opRegDict = {int(writeNode.attrib['idx']):targetReg}
                    chainAsm = getInstrInstanceFromNode(candidate, [targetReg], [targetReg], True, opRegDict).asm
                    configs.append(LatConfig(instrI, chainInstrs=chainAsm, chainLatency=1))
        return configs

    # vector register: load to XMM13 (once with a floating-point, and once with an integer instruction), and go from
    # there to the XMM part of targetReg
    loadInstrs = {32: ('MOVSS', 'MOVD'), 64: ('MOVSD', 'MOVQ')}
    loadFP, loadInt = loadInstrs.get(int(memOpNode.attrib['width']), ('MOVUPD', 'MOVDQU'))
    if isAVXInstr(instrNode):
        loadFP, loadInt = 'V' + loadFP, 'V' + loadInt

    loadOperands = ' XMM13, [' + addrReg + '];'
    xmmTargetReg = 'XMM' + targetReg[3:]

    fillInstrFP, fillLatFP = getChainInstrForVectorRegs(instrNode, 'XMM13', xmmTargetReg, cRep, 'FP')
    configs.append(LatConfig(instrI, chainInstrs=loadFP + loadOperands + fillInstrFP, chainLatency=1+fillLatFP))

    # integers in YMM registers are only supported by AVX>=2
    if targetReg[0:3] != 'YMM' or instrNode.attrib['extension'] != 'AVX':
        fillInstrInt, fillLatInt = getChainInstrForVectorRegs(instrNode, 'XMM13', xmmTargetReg, cRep, 'Int')
        configs.append(LatConfig(instrI, chainInstrs=loadInt + loadOperands + fillInstrInt, chainLatency=1+fillLatInt))

    return configs
# Returns a list of LatConfigs for instrI whose chain instructions store reg to the memory location that addrReg
# points to.
def getLatConfigsFromRegToMem(instrNode, instrI, reg, addrReg, memWidth, cRep):
    """Return the latency configurations for a dependency chain from the register reg to memory.

    instrNode: XML node of the instruction
    instrI:    instruction instance that is passed to the LatConfigs
    reg:       register written by the instruction (MMX register, or XMM/YMM/ZMM register)
    addrReg:   register that contains the address of the memory operand
    memWidth:  width of the memory operand in bits
    cRep:      number of repetitions of the filler instruction in the chain

    For vector registers, two configurations are created (one that uses a floating-point, and one that uses an
    integer store), unless integer instructions are not available for the register type.
    An empty list is returned for registers for which no chain is implemented yet, and for vector registers with a
    memory width for which no store instruction is known (previously, this case led to an UnboundLocalError).
    """
    result = []

    if reg.startswith('MM'):
        result.append(LatConfig(instrI, chainInstrs='MOVQ [' + addrReg + '], ' + reg + ';', chainLatency=1))
        return result

    if 'MM' not in reg:
        # ToDo
        return result

    # (FP store, integer store, register type)
    if memWidth <= 32:
        storeInstrs = ('MOVSS', 'MOVD', 'XMM')
    else:
        storeInstrs = {64: ('MOVSD', 'MOVQ', 'XMM'),
                       128: ('MOVUPD', 'MOVDQU', 'XMM'),
                       256: ('MOVUPD', 'MOVDQU', 'YMM'),
                       512: ('MOVUPD', 'MOVDQU64', 'ZMM')}.get(memWidth)
    if storeInstrs is None:
        # there is no suitable store instruction for this width
        return result

    chainInstrFP, chainInstrInt, regPrefix = storeInstrs
    if isAVXInstr(instrNode):
        chainInstrFP = 'V' + chainInstrFP
        chainInstrInt = 'V' + chainInstrInt

    # the chain first goes from reg1 (the register written by the instruction) to reg2, which is then stored
    reg1 = regPrefix + reg[3:]
    reg2 = regPrefix + '13'
    storeOperands = ' [' + addrReg + '], ' + reg2 + ';'

    fillInstrFP, fillLatFP = getChainInstrForVectorRegs(instrNode, reg1, reg2, cRep, 'FP')
    result.append(LatConfig(instrI, chainInstrs=fillInstrFP + chainInstrFP + storeOperands, chainLatency=1+fillLatFP))

    if not (regPrefix == 'YMM' and instrNode.attrib['extension'] == 'AVX'): # integers in YMM registers are only supported by AVX>=2
        fillInstrInt, fillLatInt = getChainInstrForVectorRegs(instrNode, reg1, reg2, cRep, 'Int')
        result.append(LatConfig(instrI, chainInstrs=fillInstrInt + chainInstrInt + storeOperands, chainLatency=1+fillLatInt))

    return result
# Returns a tuple (chain instructions, chain latency) for a dependency chain from startReg to targetReg that consists
# of cRep+1 floating-point (cType == 'FP') or integer (any other cType) instructions.
def getChainInstrForVectorRegs(instrNode, startReg, targetReg, cRep, cType):
    isAVX = isAVXInstr(instrNode)

    # (instruction from startReg to targetReg, filler instruction on targetReg, key in basicLatency)
    if cType == 'FP':
        # We use (V)SHUFPD instead of (V)MOV*PD because the latter is a 0-latency operation on some CPUs in some cases
        if arch in ['ZEN+', 'ZEN2']:
            # on ZEN, all shuffles are integer operations
            firstInstr, fillerInstr, latencyKey = 'VANDPD {0}, {1}, {1};', 'VANDPD {0}, {0}, {0};', 'VANDPD'
        elif isAVX:
            firstInstr, fillerInstr, latencyKey = 'VSHUFPD {0}, {1}, {1}, 0;', 'VSHUFPD {0}, {0}, {0}, 0;', 'VSHUFPD'
        else:
            firstInstr, fillerInstr, latencyKey = 'SHUFPD {}, {}, 0;', 'SHUFPD {0}, {0}, 0;', 'SHUFPD'
    elif isAVX:
        firstInstr, fillerInstr, latencyKey = 'VPSHUFD {}, {}, 0;', 'VPSHUFD {0}, {0}, 0;', 'VPSHUFD'
    else:
        firstInstr, fillerInstr, latencyKey = 'PSHUFD {}, {}, 0;', 'PSHUFD {0}, {0}, 0;', 'PSHUFD'

    chainInstrs = firstInstr.format(targetReg, startReg) + fillerInstr.format(targetReg) * cRep
    return (chainInstrs, basicLatency[latencyKey] * (cRep+1))
class LatConfig:
    """One latency experiment: an instruction instance and the chain instructions that close the dependency cycle."""

    def __init__(self, instrI, chainInstrs='', chainLatency=0, init=None, notes=None):
        # a new list is created for each instance if no list is supplied
        if init is None:
            init = []
        if notes is None:
            notes = []

        self.instrI = instrI              # instruction instance under test
        self.chainInstrs = chainInstrs    # assembler code of the chain instructions
        self.chainLatency = chainLatency  # latency of the chain instructions
        self.init = init                  # list of initialization instructions
        self.notes = notes                # list of notes


class LatConfigList:
    """A list of LatConfigs, together with a flag that tells whether the result is only an upper bound."""

    def __init__(self, latConfigs=None, sameReg = False, isUpperBound=False, notes=None):
        # sameReg is accepted, but it is not stored
        if latConfigs is None:
            latConfigs = []
        if notes is None:
            notes = []

        self.latConfigs = latConfigs
        self.isUpperBound = isUpperBound
        self.notes = notes

    def append(self, latConfig):
        self.latConfigs.append(latConfig)

    def extend(self, latConfigs):
        self.latConfigs.extend(latConfigs)


LatResult = namedtuple('LatResult', ['minLat','maxLat','lat_sameReg','isUpperBound'])
suppressedTarget = targetNode.attrib.get('suppressed', '0') == '1' + + instrReadsFlags = len(instrNode.findall('./operand[@type="flags"][@r="1"]')) > 0 + + configList = LatConfigList() + + if startNode.text == 'RSP' or targetNode.text == 'RSP': + # we ignore operands that modify the stack pointer, as these are usually handled by the stack engine in the issue stage of the pipeline, and + # thus would not lead to meaningful results + return None + elif (startNode.text and 'RIP' in startNode.text) or (targetNode.text and 'RIP' in targetNode.text): + return None + elif startNode.attrib['type'] == 'reg': + ################# + # reg -> ... + ################# + regs1 = set(startNode.text.split(","))-globalDoNotWriteRegs-specialRegs + + if not regs1: return None + + if targetNode.attrib['type'] == 'reg': + ################# + # reg -> reg + ################# + regs2 = set(targetNode.text.split(","))-globalDoNotWriteRegs-specialRegs + + if not regs2: + return None + + if startNode == targetNode: + reg1 = sortRegs(regs1)[0] + reg2 = reg1 + else: + if len(regs2) == 1: + reg2 = sortRegs(regs2)[0] + otherRegs = filter(lambda x: (x in GPRRegs and regTo64(x)!=regTo64(reg2)) or (x not in GPRRegs and x[1:]!=reg2[1:]), regs1) + if otherRegs: + reg1 = sortRegs(otherRegs)[0] + else: + reg1 = sortRegs(regs1)[0] + else: + reg1 = sortRegs(regs1)[0] + if not useDistinctRegs and reg1 in regs2: + reg2 = reg1 + else: + otherRegs = filter(lambda x: (x in GPRRegs and regTo64(x)!=regTo64(reg1)) or (x not in GPRRegs and x[1:]!=reg1[1:]), regs2) + if otherRegs: + reg2 = sortRegs(otherRegs)[0] + else: + reg2 = sortRegs(regs2)[0] + + instrI = getInstrInstanceFromNode(instrNode, useDistinctRegs=useDistinctRegs, opRegDict={startNodeIdx:reg1, targetNodeIdx:reg2}) + + if reg1 == reg2: + configList.append(LatConfig(instrI)) + + reg1Prefix = re.sub('\d', '', reg1) + reg2Prefix = re.sub('\d', '', reg2) + + if reg1 in GPRRegs and reg2 in GPRRegs: + # MOVSX avoids partial reg stalls and cannot be 
eliminated by "move elimination" + chainInstrs = 'MOVSX {}, {};'.format(regTo64(reg1), regToSize(reg2, min(32, getRegSize(reg2)))) + chainInstrs += 'MOVSX {}, {};'.format(regTo64(reg1), regTo32(reg1)) * cRep + configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=basicLatency['MOVSX']*(cRep+1))) + elif reg1Prefix == 'K' and reg2Prefix == 'K': + chainInstr = 'KMOVQ {}, {};'.format(reg1, reg2) + chainInstr += 'KMOVQ {0}, {0};'.format(reg1) * cRep + configList.append(LatConfig(instrI, chainInstrs=chainInstr, chainLatency=basicLatency['KMOVQ']*(cRep+1))) + elif reg1Prefix == 'K' and reg2Prefix[1:] == 'MM': + # we test with both VPMOVQ2M and VPTESTNMQ (as, e.g., VPMAXUB ZMM has a higher latency with the former for some unknown reason) + chainInstr1 = 'VPMOVQ2M ' + reg1 + ', ' + reg2 + ';' + configList.append(LatConfig(instrI, chainInstrs=chainInstr1, chainLatency=basicLatency['VPMOVQ2M_'+reg2Prefix])) + chainInstr2 = 'VPTESTNMQ ' + reg1 + ' {' + reg1 + '}, ' + reg2 + ', ' + reg2 + ';' + configList.append(LatConfig(instrI, chainInstrs=chainInstr2, chainLatency=basicLatency['VPTESTNMQ_'+reg2Prefix])) + elif reg1Prefix[1:] == 'MM' and reg2Prefix == 'K': + chainInstr = 'VMOVUPS ' + reg1 + ' {' + reg2 + '}, ' + reg1Prefix + '14;' + configList.append(LatConfig(instrI, chainInstrs=chainInstr, chainLatency=basicLatency['VMOVUPS_'+reg1Prefix+'_K'])) + elif reg1Prefix[1:] == reg2Prefix[1:]: + # if the registers have different widths, bring the smaller to the width of the larger + reg1 = reg1.replace(reg1Prefix, min(reg1Prefix, reg2Prefix)) + reg2 = reg2.replace(reg2Prefix, min(reg1Prefix, reg2Prefix)) + + if reg1Prefix =='MM': + chainInstr = 'MOVQ {}, {};'.format(reg1, reg2) + chainInstr += 'MOVQ {0}, {0};'.format(reg1) * cRep + configList.append(LatConfig(instrI, chainInstrs=chainInstr, chainLatency=basicLatency['MOVQ']*(cRep+1))) + elif reg1Prefix in ['XMM', 'YMM', 'ZMM']: + chainInstrFP, chainLatencyFP = getChainInstrForVectorRegs(instrNode, reg2, reg1, 
cRep, 'FP') + configList.append(LatConfig(instrI, chainInstrs=chainInstrFP, chainLatency=chainLatencyFP)) + + if not (reg1Prefix == 'YMM' and instrNode.attrib['extension'] == 'AVX'): # integers in YMM registers are only supported by AVX>=2 + chainInstrInt, chainLatencyInt = getChainInstrForVectorRegs(instrNode, reg2, reg1, cRep, 'Int') + configList.append(LatConfig(instrI, chainInstrs=chainInstrInt, chainLatency=chainLatencyInt)) + else: + print 'invalid reg prefix: ' + reg1Prefix + return None + else: + configList.isUpperBound = True + # find all other instrs from reg2 to reg1 + for chainInstrI in getAllChainInstrsFromRegToReg(instrNode, reg2, reg1): + configList.append(LatConfig(instrI, chainInstrs=chainInstrI.asm, chainLatency=1)) + elif targetNode.attrib['type'] == 'flags': + ################# + # reg -> flags + ################# + + reg = sortRegs(regs1)[0] + + for flag in STATUSFLAGS_noAF: + if not ('flag_'+flag) in targetNode.attrib: continue + if not 'w' in targetNode.attrib[('flag_'+flag)]: continue + + if reg in GPRRegs: + regSize = getRegSize(reg) + if regSize == 8: + chainInstr = 'SET{} {};'.format(flag[0], reg) + chainLatency = basicLatency['SET' + flag[0]] + else: + chainInstr = 'CMOV{} {}, {};'.format(flag[0], reg, regToSize('R15', regSize)) + chainLatency = basicLatency['CMOV' + flag[0]] + instrI = getInstrInstanceFromNode(instrNode, ['R15'], ['R15'], useDistinctRegs, {startNodeIdx:reg}) + + movsxInstr = 'MOVSX {}, {};'.format(regTo64(reg), regToSize(reg, min(32, regSize))) + chainInstrs = chainInstr + movsxInstr * cRep + chainLatency = chainLatency + basicLatency['MOVSX'] * cRep + + configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) + elif 'MM' in reg: + instrI = getInstrInstanceFromNode(instrNode, ['R13', 'R15'], ['R13', 'R15'], True, {startNodeIdx:reg}) + configList.isUpperBound = True + for chainInstrI in getAllChainInstrsFromRegToReg(instrNode, 'R13', reg): + chainInstrs = 'CMOV' + flag[0] + ' R13, R15; ' + 
chainInstrI.asm + chainLatency = basicLatency['CMOV' + flag[0]] + 1 + configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) + elif targetNode.attrib['type'] == 'mem': + ################# + # reg -> mem + ################# + + reg = sortRegs(regs1)[0] + addrReg = getAddrReg(instrNode, targetNode) + + if reg in GPRRegs: + instrI = getInstrInstanceFromNode(instrNode, useDistinctRegs=useDistinctRegs, opRegDict={startNodeIdx:reg}) + + configList.isUpperBound = True + chainInstrs = 'MOV {}, [{}];'.format(reg, addrReg) + chainInstrs += 'MOVSX {}, {};'.format(regTo64(reg), regToSize(reg, min(32, getRegSize(reg)))) * cRep + chainLatency = int(basicLatency['MOV_10MOVSX_MOV_'+str(getRegSize(reg))] >= 12) # 0 if CPU supports zero-latency store forwarding + chainLatency += basicLatency['MOVSX'] * cRep + + if re.search('BT.*MEMv_GPRv', instrNode.attrib['iform']): + chainInstrs += 'AND ' + reg + ', 0;' + chainLatency += basicLatency['AND'] + + configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) + elif 'MM' in reg: + if suppressedTarget: + # ToDo: only happens in the case of maskmovdqu + pass + else: + instrI = getInstrInstanceFromNode(instrNode, useDistinctRegs=True, opRegDict={startNodeIdx:reg}) + configList.isUpperBound = True + configList.extend(getLatConfigsFromMemToReg(instrNode, instrI, targetNode, reg, addrReg, cRep)) + else: + # ToDo + print 'unsupported reg to mem' + return None + elif startNode.attrib['type'] == 'flags': + ################# + # flags -> ... 
+ ################# + if targetNode.attrib['type'] == 'reg': + ################# + # flags -> reg + ################# + regs = set(targetNode.text.split(','))-globalDoNotWriteRegs-specialRegs + if not regs: return None + + reg = sortRegs(regs)[0] + + if reg in GPRRegs: + instrI = getInstrInstanceFromNode(instrNode, useDistinctRegs=useDistinctRegs, opRegDict={targetNodeIdx:reg}) + chainInstrs = 'TEST {0}, {0};'.format(reg) + chainLatency = basicLatency['TEST'] + configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) + + chainInstrs = 'MOVSX {}, {};'.format(regTo64(reg), regToSize(reg, min(32, getRegSize(reg)))) * cRep + chainInstrs + chainLatency += basicLatency['MOVSX'] * cRep + configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) + else: + # ToDo: there is no instruction from flag to vector reg; the only non-GPR that is possible are ST(0) and X87STATUS + return None + elif targetNode.attrib['type'] == 'flags': + ################# + # flags -> flag + ################# + + instrI = getInstrInstanceFromNode(instrNode, useDistinctRegs=useDistinctRegs) + configList.append(LatConfig(instrI)) + + cfModifiers = startNode.attrib.get('flag_CF', '') + if 'r' in cfModifiers and 'w' in cfModifiers: + chainInstrs = 'CMC;'*cRep + configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=basicLatency['CMC']*cRep)) + elif targetNode.attrib['type'] == 'mem': + ################# + # flags -> mem + ################# + instrI = getInstrInstanceFromNode(instrNode, useDistinctRegs=useDistinctRegs) + chainInstr = 'TEST ' + targetNode.attrib['memory-prefix'] + ' [' + getAddrReg(instrNode, targetNode) + '], 1' + configList.isUpperBound = True + configList.append(LatConfig(instrI, chainInstrs=chainInstr, chainLatency=1)) + elif startNode.attrib['type'] == 'mem': + ################# + # mem -> ... 
+ ################# + if startNode.attrib.get('r', '0') == '0' and targetNode != startNode: + # for memory writes, only the dependency address -> memory is interesting + return None + + addrReg = getAddrReg(instrNode, startNode) + memWidth = int(startNode.attrib['width']) + + if targetNode.attrib['type'] == 'reg': + ################# + # mem -> reg + ################# + regs = set(targetNode.text.split(",")) + if not suppressedTarget: regs -= globalDoNotWriteRegs | specialRegs + if not regs: return None + reg = sortRegs(regs)[0] + regSize = getRegSize(reg) + + if suppressedStart: + if not regs.issubset(GPRRegs): + print 'read from suppressed mem to non-GPR reg not yet supported' + return None + + if reg in GPRRegs: + instrI = getInstrInstanceFromNode(instrNode, [addrReg, 'R12'], [addrReg, 'R12'], useDistinctRegs, {targetNodeIdx:reg}) + + if addr_mem == 'addr': + # addr -> reg + chainInstrs = 'MOVSX ' + regTo64(reg) + ', ' + regToSize(reg, min(32, regSize)) + ';' + chainInstrs += 'XOR {}, {};'.format(addrReg, regTo64(reg)) * cRep + ('TEST R13, R13;' if instrReadsFlags else '') # cRep is a multiple of 2 + chainLatency = basicLatency['MOVSX'] + basicLatency['XOR'] * cRep + configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) + else: + # mem -> reg + configList = LatConfigList() + configList.isUpperBound = True + chainInstrs = 'MOVSX R12, {};'.format(regToSize(reg, min(32, regSize))) + chainInstrs += 'MOVSX R12, R12d;' * (cRep-1) + chainInstrs += 'mov [{}], {};'.format(addrReg, regToSize('R12', regSize)) + chainLatency = basicLatency['MOVSX'] * cRep + chainLatency += int(basicLatency['MOV_10MOVSX_MOV_'+str(regSize)] >= 12) # 0 if CPU supports zero-latency store forwarding + configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) + elif 'MM' in reg: + instrI = getInstrInstanceFromNode(instrNode, ['R12'], ['R12'], useDistinctRegs, {targetNodeIdx:reg}) + + if addr_mem == 'addr': + # addr -> reg + 
configList.isUpperBound = True + chainInstrs = 'MOVQ R12, {};'.format(getCanonicalReg(reg)) + if isAVXInstr(instrNode): + chainInstrs = 'V' + chainInstrs + chainInstrs += 'XOR {}, {};'.format(addrReg, 'R12') * cRep + ('TEST R13, R13;' if instrReadsFlags else '') # cRep is a multiple of 2 + chainLatency = 1 + basicLatency['XOR'] * cRep + configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) + elif addr_mem == 'addr_VSIB': + # addr_VSIB -> reg + configList.isUpperBound = True + chainInstrs = 'VANDPD {0}14, {0}14, {0}{1};'.format(startNode.attrib['VSIB'], reg[3:]) * cRep + chainLatency = basicLatency['VANDPD'] * cRep + configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) + else: + # mem -> reg + configList.isUpperBound = True + configList.extend(getLatConfigsFromRegToMem(instrNode, instrI, reg, addrReg, memWidth, cRep)) + elif targetNode.attrib['type'] == 'flags': + ################# + # mem -> flags + ################# + for flag in STATUSFLAGS_noAF: + if not ('flag_'+flag) in targetNode.attrib: continue + if not 'w' in targetNode.attrib[('flag_'+flag)]: continue + + instrI = getInstrInstanceFromNode(instrNode, [addrReg, 'R12'], [addrReg, 'R12'], useDistinctRegs) + + if addr_mem == 'addr': + # addr -> flag + chainInstr = 'CMOV' + flag[0] + ' ' + addrReg + ', ' + addrReg + chainLatency = basicLatency['CMOV' + flag[0]] + configList.append(LatConfig(instrI, chainInstrs=chainInstr, chainLatency=chainLatency)) + else: + # mem -> flag + if memWidth <= 64: + configList.isUpperBound = True + chainInstrs = 'CMOV' + flag[0] + ' R12, R12;' + chainInstrs += 'MOVSX R12, R12d;' * cRep + chainInstrs += 'mov [' + addrReg + '], ' + regToSize('R12', memWidth) + chainLatency = basicLatency['CMOV' + flag[0]] + basicLatency['MOVSX'] * cRep + chainLatency += int(basicLatency['MOV_10MOVSX_MOV_'+str(memWidth)] >= 12) # 0 if CPU supports zero-latency store forwarding + configList.append(LatConfig(instrI, 
chainInstrs=chainInstrs, chainLatency=chainLatency)) + else: + # ToDo + pass + elif targetNode.attrib['type'] == 'mem': + ################# + # mem -> mem + ################# + if startNode == targetNode: + instrI = getInstrInstanceFromNode(instrNode, [addrReg, 'R12'], [addrReg, 'R12'], useDistinctRegs=useDistinctRegs) + + if addr_mem == 'addr': + # addr -> mem + configList.isUpperBound = True + chainInstrs = 'MOV ' + regToSize('R12', min(64, memWidth)) + ', [' + addrReg + '];' + chainInstrs += ('MOVSX R12, ' + regToSize('R12', min(32, memWidth)) + ';') * cRep + chainInstrs += 'XOR ' + addrReg + ', R12; XOR ' + addrReg + ', R12;' + ('TEST R13, R13;' if instrReadsFlags else '') + chainLatency = basicLatency['MOVSX'] * cRep + 2*basicLatency['XOR'] + chainLatency += int(basicLatency['MOV_10MOVSX_MOV_'+str(min(64, memWidth))] >= 12) # 0 if CPU supports zero-latency store forwarding + configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) + else: + # mem -> mem + if startNode.attrib.get('r','0')=='1': + configList = LatConfigList() + configList.append(LatConfig(instrI)) + + if memWidth <= 64: + chainInstrs = 'MOV ' + regToSize('R12', min(64, memWidth)) + ', [' + addrReg + '];' + chainInstrs += ('MOVSX R12, ' + regToSize('R12', min(32, memWidth)) + ';')*10 + chainInstrs += ('MOV [' + addrReg + '], ' + regToSize('R12', min(64, memWidth))) + chainLatency = basicLatency['MOV_10MOVSX_MOV_'+str(min(64, memWidth))] + configList.append(LatConfig(instrI, chainInstrs=chainInstrs, chainLatency=chainLatency)) + else: + # ToDo + pass + else: + # ToDo + return None + + if not configList.latConfigs: return None + return [configList] + + +def getLatencies(instrNode, instrNodeList, tpDict, htmlReports): + if useIACA: + createIacaAsmFile("/tmp/ramdisk/asm.s", "", 0, getInstrInstanceFromNode(instrNode).asm) + + if iacaVersion == '2.1': + try: + subprocess.check_output(['as', '/tmp/ramdisk/asm.s', '-o', '/tmp/ramdisk/asm.o']) + iaca_lat = 
subprocess.check_output(iacaCMDLine + ['-analysis', 'LATENCY', '/tmp/ramdisk/asm.o'], stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + print "Error: " + e.output + return None + + if '!' in iaca_lat or not 'Latency' in iaca_lat: + print "IACA error" + return None + + latency = iaca_lat.split('\n')[3].split()[1] + + htmlReports.append('
    ' + iaca_lat + '
    \n') + + return latency + else: + if instrNode.attrib['iclass'] in ['CALL_NEAR', 'CALL_NEAR_MEMv', 'CLZERO', 'JMP', 'JMP_MEMv', 'RET_NEAR', 'RET_NEAR_IMMw', 'RDMSR', 'WRMSR', 'RDPMC', 'LEAVE', 'CPUID', 'POPF', 'POPFQ']: + return None; + if 'REP' in instrNode.attrib['iclass']: + return None; + if instrNode.attrib['category'] in ['IO', 'IOSTRINGOP', 'PKU']: + return None + + inputOpnds = [] + outputOpnds = [] + + for opNode in instrNode.iter('operand'): + if opNode.attrib['type'] == 'flags' and not any(('flag_'+f in opNode.attrib) for f in STATUSFLAGS_noAF): + continue + + if opNode.attrib.get('r', '0') == '1': + inputOpnds.append(opNode) + if opNode.attrib.get('w', '0') == '1': + outputOpnds.append(opNode) + if opNode.attrib['type'] == 'mem' and opNode.attrib.get('r', '0') == '0': + inputOpnds.append(opNode) # address of memory write + if opNode.attrib['type'] == 'reg' and opNode.attrib.get('conditionalWrite', '0') == '1': + inputOpnds.append(opNode) + + archNode = instrNode.find('./architecture[@name="' + arch + '"]') + measurementNode = archNode.find('./measurement') + + canUseSameRegForDifferentOpnds = False + if not 'GATHER' in instrNode.attrib['category'] and not 'SCATTER' in instrNode.attrib['category']: + for opNode1 in instrNode.findall('./operand[@type="reg"]'): + for opNode2 in instrNode.findall('./operand[@type="reg"]'): + if opNode2 == opNode1: continue + if opNode1.attrib.get('suppressed', '') == '1' or opNode2.attrib.get('suppressed', '') == '1': continue + if opNode1.text == opNode2.text: + canUseSameRegForDifferentOpnds = True + break + + overallMaxLat = 0 + + htmlHead = [] + htmlBottom = [] + + for opNode1 in inputOpnds: + opNode1Idx = int(opNode1.attrib['idx']) + + for opNode2 in outputOpnds: + opNode2Idx = int(opNode2.attrib['idx']) + latencyNode = None + + for addr_mem in (['addr', 'mem']+(['addr_VSIB'] if 'VSIB' in opNode1.attrib else []) if opNode1.attrib['type']=='mem' else ['']): + minLatDistinctRegs = 0 + maxLatDistinctRegs = 0 + + 
configI = 0 + for useDistinctRegs in ([True, False] if canUseSameRegForDifferentOpnds else [True]): + latConfigLists = getLatConfigLists(instrNode, opNode1, opNode2, useDistinctRegs, addr_mem, tpDict) + if latConfigLists is None: continue + + minLat = sys.maxint + maxLat = 0 + + minLatIsUpperBound = False + maxLatIsUpperBound = False + + configHtmlReports = [] + + for latConfigList in latConfigLists: + minLatForCurList = sys.maxint + + if not any((latConfig.init or latConfig.instrI.regMemInit) for latConfig in latConfigList.latConfigs): + # Test different register values for read-only registers + for readOnlyRegOpNode in instrNode.findall('./operand[@type="reg"][@r="1"]'): + if readOnlyRegOpNode == opNode1: continue + if readOnlyRegOpNode.attrib.get('w', '') == '1': continue + readOnlyRegOpNodeIdx = int(readOnlyRegOpNode.attrib['idx']) + for latConfig in list(latConfigList.latConfigs): + if not readOnlyRegOpNodeIdx in latConfig.instrI.opRegDict: + print 'readOnlyRegOpNodeIdx not found in opRegDict' + continue + reg = latConfig.instrI.opRegDict[readOnlyRegOpNodeIdx] + if not reg in GPRRegs or reg in globalDoNotWriteRegs or reg in specialRegs: continue + if any((opNode is not None) for opNode in instrNode.findall('./operand[@type="reg"][@w="1"]') + if regTo64(latConfig.instrI.opRegDict[int(opNode.attrib['idx'])]) == regTo64(reg)): continue + + latConfigList.latConfigs.remove(latConfig) + for regVal in ['0', '1', '2']: + newlatConfig = copy.deepcopy(latConfig) + newlatConfig.init += ['MOV ' + reg + ', ' + regVal] + newlatConfig.notes.append('with ' + reg + '=' + regVal) + latConfigList.latConfigs.append(newlatConfig) + + # Create a copy of each experiment with dependency-breaking instructions for all dependencies other than the dependency from opNode2 to + # opNode1 if there aren't sufficiently many fill instructions in the chain + if (not 'DIV' in instrNode.attrib['iclass'] and not 'SQRT' in instrNode.attrib['iclass'] and + not 'GATHER' in 
instrNode.attrib['category'] and not 'SCATTER' in instrNode.attrib['category']): + for latConfig in list(latConfigList.latConfigs): + if latConfig.chainLatency > tpDict[instrNode].TP_single: + continue + + depBreakingInstrs = getDependencyBreakingInstrs(instrNode, latConfig.instrI.opRegDict) + if not depBreakingInstrs: continue + + newlatConfig = copy.deepcopy(latConfig) + depBreakingAdded = False + for depOpNode in depBreakingInstrs: + depOpNodeIdx = int(depOpNode.attrib['idx']) + if (depOpNodeIdx in latConfig.instrI.opRegDict and opNode1Idx in latConfig.instrI.opRegDict + and latConfig.instrI.opRegDict[depOpNodeIdx] == latConfig.instrI.opRegDict[opNode1Idx]): + continue + elif depOpNode == opNode1 and opNode1 == opNode2: + continue + elif opNode1.attrib['type'] == 'flags' and depOpNode.attrib['type'] == 'flags': + continue + elif depOpNode == opNode1: + # ToDo: this might be unnecessary, if CMOVcc wouldn't use the same regs + newlatConfig.chainInstrs = depBreakingInstrs[depOpNode] + '; ' + latConfig.chainInstrs + depBreakingAdded = True + else: + if not latConfig.chainInstrs.endswith(depBreakingInstrs[depOpNode]): + newlatConfig.chainInstrs = latConfig.chainInstrs + ';' + depBreakingInstrs[depOpNode] + depBreakingAdded = True + if depBreakingAdded: + latConfigList.latConfigs.remove(latConfig) + latConfigList.latConfigs.append(latConfig) # order ... 
+ newlatConfig.notes.append('with dependency-breaking instructions') + latConfigList.latConfigs.append(newlatConfig) + + # make sure that the mask for gather/scatter instruction is never empty + if instrNode.attrib['extension'] == 'AVX2GATHER': + for latConfig in latConfigList.latConfigs: + maskReg = latConfig.instrI.opRegDict[3] + if opNode1Idx == 3: + latConfig.chainInstrs += 'VPCMPEQD {0}, {0}, {0};'.format(maskReg[0:3] + '13') + if 'VSHUFPD' in latConfig.chainInstrs: + orInstr = 'VORPD' + else: + orInstr = 'VPOR' + latConfig.chainInstrs += '{0} {1}, {1}, {2};'.format(orInstr, maskReg, maskReg[0:3] + '13') + latConfig.chainLatency += basicLatency[orInstr] + else: + latConfig.chainInstrs += 'VPCMPEQD {0}, {0}, {0};'.format(maskReg) + elif instrNode.attrib['extension'] == 'AVX512EVEX' and ('GATHER' in instrNode.attrib['category'] or 'SCATTER' in instrNode.attrib['category']): + for latConfig in latConfigList.latConfigs: + maskReg = latConfig.instrI.opRegDict[2] + if opNode1Idx == 2: + # ToDo + pass + else: + latConfig.chainInstrs += 'VPCMPD {0}, {1}, {1}, 7;'.format(maskReg, 'XMM15') + + mlDP = sys.maxint + mlnoDP = sys.maxint + + for latConfig in latConfigList.latConfigs: + configI += 1 + configHtmlReports.append('

    Experiment ' + str(configI)) + if latConfig.notes or not useDistinctRegs: + configHtmlReports.append(' (' + ', '.join(latConfig.notes + + (['with the same register for different operands'] if not useDistinctRegs else [])) + ')') + configHtmlReports.append('

    \n') + + configHtmlReports.append('\n') + + if not measurementResult: + print 'no result found' + continue + + cycles = measurementResult['Core cycles'] + + cycles = int(cycles+.2) + + if latConfig.chainLatency: + cycles -= latConfig.chainLatency + + cycles = max(0, cycles) # for dep. breaking instructions (like XOR), cycles might be negative after subtracting chainLatency + + minLatForCurList = min(minLatForCurList, cycles) + + if minLatForCurList < minLat: + minLat = minLatForCurList + minLatIsUpperBound = latConfigList.isUpperBound + + if minLatForCurList > maxLat: + maxLat = minLatForCurList + maxLatIsUpperBound = latConfigList.isUpperBound + + if minLat > maxLat: continue + + if useDistinctRegs: + minLatDistinctRegs = minLat + maxLatDistinctRegs = maxLat + else: + if minLatDistinctRegs == minLat and maxLatDistinctRegs == maxLat: + htmlBottom.append('
    ') + htmlBottom += configHtmlReports + htmlBottom.append('
    ') + continue + + overallMaxLat = max(overallMaxLat, maxLat) + + if latencyNode is None: + latencyNode = SubElement(measurementNode, 'latency') + latencyNode.attrib['start_op'] = str(opNode1.attrib['idx']) + latencyNode.attrib['target_op'] = str(opNode2.attrib['idx']) + + suffix = ('_'+addr_mem if addr_mem else '') + ('_same_reg' if not useDistinctRegs else '') + if minLat == maxLat: + latencyNode.attrib['cycles'+suffix] = str(minLat) + if minLatIsUpperBound: + latencyNode.attrib['cycles'+suffix+'_is_upper_bound'] = '1' + else: + latencyNode.attrib['min_cycles'+suffix] = str(minLat) + if minLatIsUpperBound: + latencyNode.attrib['min_cycles'+suffix+'_is_upper_bound'] = '1' + latencyNode.attrib['max_cycles'+suffix] = str(maxLat) + if maxLatIsUpperBound: + latencyNode.attrib['max_cycles'+suffix+'_is_upper_bound'] = '1' + + summaryLine = latencyNodeToStr(latencyNode, not useDistinctRegs, addr_mem) + + h2ID = 'lat' + str(opNode1Idx) + '->' + str(opNode2Idx) + suffix + htmlHead.append('

    ' + summaryLine + '

    ') + if useDistinctRegs: htmlBottom.append('
    ') + htmlBottom.append('

    ' + summaryLine + '

    ') + htmlBottom.append('
    ') + htmlBottom += configHtmlReports + htmlBottom.append('
    ') + + addHTMLCodeForOperands(instrNode, htmlReports) + htmlReports.append('
    ') + htmlReports += htmlHead + htmlReports += htmlBottom + + return overallMaxLat + + +def isSSEInstr(instrNode): + extension = instrNode.attrib['extension'] + return 'SSE' in extension or extension in ['AES'] + + +def isAVXInstr(instrNode): + return ('vex' in instrNode.attrib or 'evex' in instrNode.attrib) + + +def writeHtmlFile(folder, instrNode, title, body): + filename = canonicalizeInstrString(instrNode.attrib['string']) + if useIACA: + filename += '-IACA' + iacaVersion + else: + filename += '-Measurements' + filename += '.html' + + folder = '/tmp/cpu-html/' + folder + if not os.path.exists(folder): + os.makedirs(folder) + with open(os.path.join(folder, filename), "w") as f: + f.write('\n' + '\n' + '' + title + '\n' + '\n' + '\n' + + body + + '\n' + '\n') + + +# returns list of xml instruction nodes +def filterInstructions(XMLRoot): + allInstrs = list(XMLRoot.iter('instruction')) + + instrSet = set(allInstrs) + for XMLInstr in allInstrs: + extension = XMLInstr.attrib['extension'] + isaSet = XMLInstr.attrib['isa-set'] + + # Future instruction set extensions + if extension in ['CET', 'RDPRU']: instrSet.discard(XMLInstr) + + # Not supported by assembler + if XMLInstr.attrib['iclass'] == 'NOP' and len(XMLInstr.findall('operand')) > 1: + instrSet.discard(XMLInstr) + if extension in ['WBNOINVD']: instrSet.discard(XMLInstr) + + # Only supported by VIA + if 'VIA_' in extension: + instrSet.discard(XMLInstr) + + # "no CPU available today has PTWRITE support" (https://software.intel.com/en-us/forums/intel-isa-extensions/topic/704356) + if extension in ['PT']: + instrSet.discard(XMLInstr) + + if useIACA: + if extension in ['AVX512VEX', 'AVX512EVEX'] and arch != 'SKX': instrSet.discard(XMLInstr) + # AMD + if extension in ['3DNOW', 'CLZERO', 'FMA4', 'MONITORX', 'SSE4a', 'SVM', 'TBM', 'XOP']: instrSet.discard(XMLInstr) + # Future instruction set extensions + if extension in ['CLDEMOTE', 'ENQCMD', 'MOVDIR', 'PCONFIG', 'WAITPKG']: instrSet.discard(XMLInstr) + if extension 
in ['AVX512EVEX'] and any(x in isaSet for x in ['4FMAPS', '4VNNIW', 'ER', 'PF']): instrSet.discard(XMLInstr) + if any(x in isaSet for x in ['BF16', 'BITALG', 'GFNI', 'VAES', 'VBMI2', 'VNNI', 'VP2INTERSECT', 'VPCLMULQDQ', 'VPOPCNTDQ']): instrSet.discard(XMLInstr) + + if useIACA: return list(instrSet) + + cpu = cpuid.CPUID() + + _, _, ecx1, edx1 = cpu(0x01) + _, ebx7, ecx7, edx7 = cpu(0x07) + eax7_1, _, _, _ = cpu(0x07, 0x01) + _, _, ecx8_1, edx8_1 = cpu(0x80000001) + _, ebx8_8, _, _ = cpu(0x80000008) + + for XMLInstr in allInstrs: + iclass = XMLInstr.attrib['iclass'] + extension = XMLInstr.attrib['extension'] + isaSet = XMLInstr.attrib['isa-set'] + category = XMLInstr.attrib['category'] + + if extension == 'SSE3' and not cpuid.get_bit(ecx1, 0): instrSet.discard(XMLInstr) + if extension == 'PCLMULQDQ' and not cpuid.get_bit(ecx1, 1): instrSet.discard(XMLInstr) + if extension == 'SSSE3' and not cpuid.get_bit(ecx1, 9): instrSet.discard(XMLInstr) + if extension == 'FMA' and not cpuid.get_bit(ecx1, 12): instrSet.discard(XMLInstr) + if extension == 'SSE4' and not cpuid.get_bit(ecx1, 19): instrSet.discard(XMLInstr) + if isaSet == 'SSE42' and not cpuid.get_bit(ecx1, 20): instrSet.discard(XMLInstr) + if extension == 'MOVBE' and not cpuid.get_bit(ecx1, 22): instrSet.discard(XMLInstr) + if isaSet == 'POPCNT' and not cpuid.get_bit(ecx1, 23): instrSet.discard(XMLInstr) + if extension == 'AES' and not cpuid.get_bit(ecx1, 25): instrSet.discard(XMLInstr) + if extension == 'AVX': + if not cpuid.get_bit(ecx1, 28): + instrSet.discard(XMLInstr) + else: + global supportsAVX + supportsAVX = True + if extension == 'AVXAES' and not (cpuid.get_bit(ecx1, 25) and cpuid.get_bit(ecx1, 28)): instrSet.discard(XMLInstr) + if extension == 'F16C' and not cpuid.get_bit(ecx1, 29): instrSet.discard(XMLInstr) + if extension == 'RDRAND' and not cpuid.get_bit(ecx1, 30): instrSet.discard(XMLInstr) + if extension == 'MMX' and not cpuid.get_bit(edx1, 23): instrSet.discard(XMLInstr) + if extension == 'SSE' and 
not cpuid.get_bit(edx1, 25): instrSet.discard(XMLInstr) + if extension == 'SSE2' and not cpuid.get_bit(edx1, 26): instrSet.discard(XMLInstr) + if extension == 'BMI1' and not cpuid.get_bit(ebx7, 3): instrSet.discard(XMLInstr) + if extension in ['AVX2', 'AVX2GATHER'] and not cpuid.get_bit(ebx7, 5): instrSet.discard(XMLInstr) + if extension == 'BMI2' and not cpuid.get_bit(ebx7, 8): instrSet.discard(XMLInstr) + if extension == 'WBNOINVD' and not cpuid.get_bit(ebx7, 9): instrSet.discard(XMLInstr) + if extension == 'MPX' and not cpuid.get_bit(ebx7, 14): instrSet.discard(XMLInstr) + if isaSet.startswith('AVX512F') and not cpuid.get_bit(ebx7, 16): instrSet.discard(XMLInstr) + if isaSet.startswith('AVX512DQ') and not cpuid.get_bit(ebx7, 16): instrSet.discard(XMLInstr) + if extension == 'RDSEED' and not cpuid.get_bit(ebx7, 18): instrSet.discard(XMLInstr) + if extension == 'ADOX_ADCX' and not cpuid.get_bit(ebx7, 19): instrSet.discard(XMLInstr) + if isaSet.startswith('AVX512_IFMA') and not cpuid.get_bit(ebx7, 21): instrSet.discard(XMLInstr) + if extension == 'CLFLUSHOPT' and not cpuid.get_bit(ebx7, 23): instrSet.discard(XMLInstr) + if extension == 'CLWB' and not cpuid.get_bit(ebx7, 24): instrSet.discard(XMLInstr) + if isaSet.startswith('AVX512PF') and not cpuid.get_bit(ebx7, 26): instrSet.discard(XMLInstr) + if isaSet.startswith('AVX512ER') and not cpuid.get_bit(ebx7, 27): instrSet.discard(XMLInstr) + if isaSet.startswith('AVX512CD') and not cpuid.get_bit(ebx7, 28): instrSet.discard(XMLInstr) + if extension == 'SHA' and not cpuid.get_bit(ebx7, 29): instrSet.discard(XMLInstr) + if isaSet.startswith('AVX512BW') and not cpuid.get_bit(ebx7, 30): instrSet.discard(XMLInstr) + if extension == 'PREFETCHWT1' and not cpuid.get_bit(ecx7, 0): instrSet.discard(XMLInstr) + if category == 'AVX512_VBMI' and not cpuid.get_bit(ecx7, 1): instrSet.discard(XMLInstr) + if extension == 'PKU' and not cpuid.get_bit(ecx7, 4): instrSet.discard(XMLInstr) + if extension == 'WAITPKG' and not 
cpuid.get_bit(ecx7, 5): instrSet.discard(XMLInstr) + if isaSet.startswith('AVX512_VBMI2') and not cpuid.get_bit(ecx7, 6): instrSet.discard(XMLInstr) + if category == 'GFNI' and not cpuid.get_bit(ecx7, 8): instrSet.discard(XMLInstr) + if 'VAES' in isaSet and not cpuid.get_bit(ecx7, 9): instrSet.discard(XMLInstr) + if 'VPCLMULQDQ' in isaSet and not cpuid.get_bit(ecx7, 10): instrSet.discard(XMLInstr) + if isaSet.startswith('AVX512_VNNI') and not cpuid.get_bit(ecx7, 11): instrSet.discard(XMLInstr) + if isaSet.startswith('AVX512_BITALG') and not cpuid.get_bit(ecx7, 12): instrSet.discard(XMLInstr) + if isaSet.startswith('AVX512_VPOPCNTDQ') and not cpuid.get_bit(ecx7, 14): instrSet.discard(XMLInstr) + if extension == 'RDPID' and not cpuid.get_bit(ecx7, 22): instrSet.discard(XMLInstr) + if extension == 'CLDEMOTE' and not cpuid.get_bit(ecx7, 25): instrSet.discard(XMLInstr) + if iclass == 'MOVDIRI' and not cpuid.get_bit(ecx7, 27): instrSet.discard(XMLInstr) + if iclass == 'MOVDIR64B' and not cpuid.get_bit(ecx7, 28): instrSet.discard(XMLInstr) + if extension == 'ENQCMD' and not cpuid.get_bit(ecx7, 29): instrSet.discard(XMLInstr) + if isaSet.startswith('AVX512_4VNNI') and not cpuid.get_bit(edx7, 2): instrSet.discard(XMLInstr) + if isaSet.startswith('AVX512_4FMAPS') and not cpuid.get_bit(edx7, 3): instrSet.discard(XMLInstr) + if isaSet.startswith('AVX512_VP2INTERSECT') and not cpuid.get_bit(edx7, 8): instrSet.discard(XMLInstr) + if extension == 'PCONFIG' and not cpuid.get_bit(edx7, 18): instrSet.discard(XMLInstr) + if isaSet.startswith('AVX512_BF16') and not cpuid.get_bit(eax7_1, 5): instrSet.discard(XMLInstr) + if extension == 'SSE4a' and not cpuid.get_bit(ecx8_1, 6): instrSet.discard(XMLInstr) + if extension == 'XOP' and not cpuid.get_bit(ecx8_1, 11): instrSet.discard(XMLInstr) + if extension == 'FMA4' and not cpuid.get_bit(ecx8_1, 16): instrSet.discard(XMLInstr) + if extension == 'TBM' and not cpuid.get_bit(ecx8_1, 21): instrSet.discard(XMLInstr) + if extension == 'RDTSCP' 
and not cpuid.get_bit(edx8_1, 27): instrSet.discard(XMLInstr) + if extension == '3DNOW' and not cpuid.get_bit(edx8_1, 31): instrSet.discard(XMLInstr) + if extension in ['CLZERO']and not cpuid.get_bit(ebx8_8, 0): instrSet.discard(XMLInstr) + + # Virtualization instructions + if extension in ['SVM', 'VMFUNC', 'VTX']: instrSet.discard(XMLInstr) + + # Safer Mode Extensions + if extension in ['SMX']: instrSet.discard(XMLInstr) + + # Software Guard Extensions + if extension in ['SGX', 'SGX_ENCLV']: instrSet.discard(XMLInstr) + + # Transactional Synchronization Extensions + if extension in ['RTM']: instrSet.discard(XMLInstr) + + # X87 instructions: + if extension in ['X87']: instrSet.discard(XMLInstr) + if XMLInstr.attrib['category'] in ['X87_ALU']: instrSet.discard(XMLInstr) + + # System instructions + if extension in ['INVPCID', 'MONITOR', 'MONITORX', 'RDWRFSGS', 'SMAP', 'XSAVE', 'XSAVEC', 'XSAVEOPT', 'XSAVES']: instrSet.discard(XMLInstr) + if XMLInstr.attrib['category'] in ['INTERRUPT', 'SEGOP', 'SYSCALL', 'SYSRET']: instrSet.discard(XMLInstr) + if XMLInstr.attrib['iclass'] in ['CALL_FAR', 'HLT', 'INVD', 'IRET', 'IRETD', 'IRETQ', 'JMP_FAR', 'LTR', 'RET_FAR', 'UD2']: + instrSet.discard(XMLInstr) + if XMLInstr.attrib['iform'] in ['POP_FS', 'POP_GS', 'MOV_CR_CR_GPR64', 'MOV_SEG_MEMw', 'MOV_SEG_GPR16', 'SWAPGS']: instrSet.discard(XMLInstr) + + # Undefined instructions + if XMLInstr.attrib['iclass'].startswith('UD'): instrSet.discard(XMLInstr) + + return list(instrSet) + + +def main(): + parser = argparse.ArgumentParser(description='CPU Benchmarks') + parser.add_argument("-iaca", help="IACA command line; if not specified, perf. ctrs. 
are used") + parser.add_argument("-input", help="Instructions XML file", required=True) + parser.add_argument("-output", help="Output XML file", default='result.xml') + parser.add_argument("-arch", help="Architecture, Supported: [NHM, ...]") + parser.add_argument("-noPretty", help="Disable pretty printing XML file", action='store_true') + parser.add_argument("-noPorts", help="Don't measure port usage", action='store_true') + parser.add_argument("-tpInput", help=".pickle file with TP data") + parser.add_argument("-latInput", help=".pickle file with latency data") + parser.add_argument("-debug", help="Debug output", action='store_true') + + args = parser.parse_args() + + global arch + if args.arch is not None: + arch = args.arch + else: + cpu = cpuid.CPUID() + arch = cpuid.micro_arch(cpu) + print cpuid.get_basic_info(cpu) + if arch == 'unknown': + exit(1) + + global debugOutput + debugOutput = args.debug + + global useIACA + if args.iaca: + useIACA = True + + try: + versionString = subprocess.check_output([args.iaca], stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + versionString = e.output + global iacaVersion + iacaVersion = re.search('\d\.\d', versionString).group(0) + global iacaCMDLine + iacaCMDLine = [args.iaca, '-reduceout', '-arch', arch] + if iacaVersion == '2.1': + iacaCMDLine.append('-64') + else: + useIACA = False + + resetNanoBench() + + if arch in ['ZEN+', 'ZEN2']: + configurePFCs(['UOPS','FpuPipeAssignment.Total0', 'FpuPipeAssignment.Total1', 'FpuPipeAssignment.Total2', 'FpuPipeAssignment.Total3', 'DIV_CYCLES']) + else: + configurePFCs(['UOPS', 'RETIRE_SLOTS', 'UOPS_PORT0', 'UOPS_PORT1', 'UOPS_PORT2', 'UOPS_PORT3', 'UOPS_PORT4', 'UOPS_PORT5', 'UOPS_PORT6', 'UOPS_PORT7', + 'UOPS_PORT23', 'UOPS_PORT49', 'UOPS_PORT78', 'DIV_CYCLES', 'ILD_STALL.LCP', 'INST_DECODED.DEC0']) + + try: + subprocess.check_output('mkdir -p /tmp/ramdisk; sudo mount -t tmpfs -o size=100M none /tmp/ramdisk/', shell=True) + except subprocess.CalledProcessError 
as e: + print "Could not create ramdisk " + e.output + exit(1) + + XMLRoot = ET.parse(args.input).getroot() + XMLRoot.attrib['date'] = str(datetime.date.today()) + + global instrNodeList + instrNodeList = filterInstructions(XMLRoot) + + global instrNodeDict + instrNodeDict = {instrNode.attrib['string']: instrNode for instrNode in instrNodeList} + + # move instructions that need a preInstr to the end, as their throughput can only be determined after the throughput of the instructions included in the + # preInstr has been measured + instrRequiringPreInstr = [] + if not useIACA: + instrRequiringPreInstr = [x for x in instrNodeList if 'DIV' in x.attrib['iclass'] or 'SQRT' in x.attrib['iclass'] or getPreInstr(x)[0]] + instrNodeList.sort(key=lambda x: (x in instrRequiringPreInstr, x.attrib['string'])) + + for instrNode in instrNodeList: + archNode = instrNode.find('./architecture[@name="' + arch + '"]') + if archNode is None: + archNode = SubElement(instrNode, "architecture") + archNode.attrib['name'] = arch + if not useIACA: + measurementNode = archNode.find('./measurement') + if measurementNode is None: + measurementNode = SubElement(archNode, "measurement") + + ######################## + # Througput and Uops + ######################## + + tpDict = {} + tpDictSameReg = {} + tpDictNoInteriteration = {} + + if args.tpInput is not None: + with open(args.tpInput, 'rb') as f: + pTpDict, pTpDictSameReg, pTpDictNoInteriteration = pickle.load(f) + tpDict = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDict.items()} + tpDictSameReg = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDictSameReg.items()} + tpDictNoInteriteration = {instrNodeDict[k.attrib['string']]:v for k,v in pTpDictNoInteriteration.items()} + else: + for i, instrNode in enumerate(instrNodeList): + print 'Measuring throughput for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')' + + htmlReports = ['

    ' + instrNode.attrib['string'] + ' - Throughput and Uops' + (' (IACA '+iacaVersion+')' if useIACA else '') + '

    \n
    \n'] + + hasCommonReg = hasCommonRegister(instrNode) + if hasCommonReg: htmlReports.append('

    With different registers for different operands

    \n') + + tpResult = getThroughputAndUops(instrNode, True, htmlReports) + print instrNode.attrib['string'] + " - tp: " + str(tpResult) + + if tpResult: + tpDict[instrNode] = tpResult + + if hasCommonReg: + htmlReports.append('

    With the same register for for different operands

    \n') + tpResultSameReg = getThroughputAndUops(instrNode, False, htmlReports) + if tpResultSameReg: + tpDictSameReg[instrNode] = tpResultSameReg + + if useIACA and iacaVersion in ['2.1', '2.2']: + htmlReports.append('

    With the -no_interiteration flag

    \n') + tp = getThroughputIacaNoInteriteration(instrNode, htmlReports) + if tp: tpDictNoInteriteration[instrNode] = tp + + if tpResult: writeHtmlFile('html-tp/'+arch, instrNode, instrNode.attrib['string'], ''.join(htmlReports)) + with open('tp_' + arch + '.pickle', 'wb') as f: + pickle.dump((tpDict, tpDictSameReg, tpDictNoInteriteration), f) + + num_ports = len(tpDict.values()[0].unblocked_ports) + + ######################## + # Latency + ######################## + + if not useIACA: + configurePFCs(['UOPS']) + getBasicLatencies(instrNodeList) + + latencyDict = {} + + if args.latInput is not None: + with open(args.latInput, 'rb') as f: + latencyDict = {instrNodeDict[k.attrib['string']]:v for k,v in pickle.load(f).items()} + elif not useIACA or iacaVersion == '2.1': + for i, instrNode in enumerate(instrNodeList): + print 'Measuring latencies for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')' + + htmlReports = ['

    ' + instrNode.attrib['string'] + ' - Latency' + (' (IACA '+iacaVersion+')' if useIACA else '') + '

    \n
    \n'] + lat = getLatencies(instrNode, instrNodeList, tpDict, htmlReports) + + if lat is not None: + if debugOutput: print instrNode.attrib['iform'] + ': ' + str(lat) + latencyDict[instrNode] = lat + writeHtmlFile('html-lat/'+arch, instrNode, instrNode.attrib['string'], ''.join(htmlReports)) + with open('lat_' + arch + '.pickle', 'wb') as f: + pickle.dump(latencyDict, f) + + ######################## + # Ports + ######################## + + if not useIACA: + configurePFCs(['UOPS']) + + # the elements of this set are sets of ports that either have the same functional units, or that cannot be used independently + portCombinationsResultDict = {} + portCombinationsResultDictSameReg = {} + + if not args.noPorts: + # iforms of instructions that are potentially zero-latency instructions + # we consider all MOVZX instructions to be potentially zero-latency instr.; the descr. in the manual is not accurate as, e.g., MOVZX RSI, CL can be + # eliminated, but MOVZX RSI, DIL cannot (at least on Coffee Lake) + zeroLatencyMovIforms = set(x.attrib['iform'] for x in instrNodeList + if x.attrib['iform'].startswith(('MOV_', 'MOVZX_', 'NOP', 'MOVUPD_', 'MOVAPD_', 'MOVUPS_', 'MOVAPS_', 'MOVDQA_', 'MOVDQU_', + 'VMOVUPD_', 'VMOVAPD_', 'VMOVUPS_', 'VMOVAPS_', 'VMOVDQA_', 'VMOVDQU_')) + and len(x.findall('./operand[@type="reg"]')) >= 2 and not 'MEM' in x.attrib['iform']) + # iforms of instructions that change the control flow based on a register, flag, or memory location + controlFlowInstrs = set(instr for instr in instrNodeList for op in instr.findall('./operand[@type="reg"]') if op.text == 'RIP') + disallowedBlockingInstrs = set(instr for instr in tpDict + if instr.attrib['iform'] in (zeroLatencyMovIforms | serializingInstructions | set(['PAUSE'])) + or (instr in controlFlowInstrs and not instr.attrib['iform'] == 'JMP_RELBRb') + or (instr.find('./operand[@base="RSP"]') is not None) + or (instr.find('./operand[@conditionalWrite="1"]') is not None) + or instr.attrib['category'] == 'SYSTEM' 
+ or instr.attrib['extension'] == 'X87' + or '_AL_' in instr.attrib['iform'] or '_OrAX_' in instr.attrib['iform'] + or tpDict[instr].TP_noDepBreaking_noLoop - .2 > max([uops for _, uops in tpDict[instr].unblocked_ports.items()] or [0]) + or '512' in instr.attrib['isa-set']) # on SKX, some AVX-512 instructions can 'shut down' vector units on port 1 + if isAMDCPU(): + disallowedBlockingInstrs |= set(instr for instr in instrNodeList for op in instr.findall('./operand[@type="mem"]')) + # combining SHA instr. with other instr. leads to wrong port counts + disallowedBlockingInstrs |= set(instr for instr in instrNodeList if instr.attrib['extension'] == 'SHA') + # combining FP with non-FP instr. can lead to wrong port counts + disallowedBlockingInstrs |= set(instr for instr in instrNodeList if instr.attrib['category'] in ['LOGICAL_FP'] or + any(not 'f' in o.attrib.get('xtype','f') for o in instr.findall('./operand'))) + + print 'disallowedBlockingInstrs' + for instrNode in disallowedBlockingInstrs: + print ' ' + str(instrNode.attrib['string']) + + print 'tpDict' + for instr, tpResult in tpDict.items(): + print ' ' + str(instr.attrib['string']) + ' ' + str(tpResult.unblocked_ports) + + # we cannot start higher than .79 as IACA has .2 uops on each port for a port usage of, e.g., 1*p1256 + # using uops_dict instead can be problematic because in IACA the uops on the individual ports do not always add up to this value + oneUopInstrs = [instr for instr, tpResult in tpDict.items() if instr not in disallowedBlockingInstrs and .79 < sum([v for v in tpResult.unblocked_ports.values() if v>.1]) < 1.11] + + print 'oneUopInstrs' + for instrNode in oneUopInstrs: + print ' ' + str(instrNode.attrib['string']) + # dicts from port combination to a set of instructions (either not containing AVX or SSE instructions bec. 
of transition penalty) that always uses these ports + blockingInstructionsDictNonAVX_set = {} + blockingInstructionsDictNonSSE_set = {} + + for instrNode in oneUopInstrs: + usedPorts = frozenset({p for p, x in tpDict[instrNode].unblocked_ports.items() if x>0.1}) + if usedPorts: + print instrNode.attrib['iform'] + ': ' + str(usedPorts) + ' ' + str(len(instrNode.findall('./operand[@suppressed="1"]'))) + + if not isSSEInstr(instrNode): + if not usedPorts in blockingInstructionsDictNonSSE_set: blockingInstructionsDictNonSSE_set[usedPorts] = set() + blockingInstructionsDictNonSSE_set[usedPorts].add(instrNode) + if not isAVXInstr(instrNode): + if not usedPorts in blockingInstructionsDictNonAVX_set: blockingInstructionsDictNonAVX_set[usedPorts] = set() + blockingInstructionsDictNonAVX_set[usedPorts].add(instrNode) + + # choose instruction with lowest throughput value; prefer non-control flow instructions, instr. that do not need decoder 0, and instr. with as few as + # possible implicit operands that are read + sort_key = lambda x:(x in controlFlowInstrs, tpDict[x].dec0, len(x.findall('./operand[@suppressed="1"]')), tpDict[x].TP_noDepBreaking_noLoop, x.attrib['string']) + blockingInstructionsDictNonAVX = {comb: next(iter(sorted(instr_set, key=sort_key))) for comb, instr_set in blockingInstructionsDictNonAVX_set.items()} + blockingInstructionsDictNonSSE = {comb: next(iter(sorted(instr_set, key=sort_key))) for comb, instr_set in blockingInstructionsDictNonSSE_set.items()} + + for comb, instr_set in blockingInstructionsDictNonAVX_set.items(): + print comb + print [x.attrib['string'] for x in sorted(instr_set, key=sort_key)] + + print str(blockingInstructionsDictNonAVX.items()) + + # mov to mem has always two uops: store address and store data; there is no instruction that uses just one of them + movMemInstrNode = instrNodeDict['MOV (M64, R64)'] + + if arch in ['ICL']: + storeDataPort = 49 + else: + storeDataPort = 4 + 
blockingInstructionsDictNonAVX[frozenset({storeDataPort})] = movMemInstrNode + blockingInstructionsDictNonSSE[frozenset({storeDataPort})] = movMemInstrNode + + storeAddressPorts = frozenset({p for p, x in tpDict[movMemInstrNode].unblocked_ports.items() if x>=0.1 and not p == storeDataPort}) + if storeAddressPorts not in blockingInstructionsDictNonAVX: blockingInstructionsDictNonAVX[storeAddressPorts] = movMemInstrNode + if storeAddressPorts not in blockingInstructionsDictNonSSE: blockingInstructionsDictNonSSE[storeAddressPorts] = movMemInstrNode + + print 'Non-AVX:' + for k,v in blockingInstructionsDictNonAVX.items(): + print str(k) + ': ' + v.attrib['iform'] + print 'Non-SSE:' + for k,v in blockingInstructionsDictNonSSE.items(): + print str(k) + ': ' + v.attrib['iform'] + + sortedPortCombinationsNonAVX = sorted(blockingInstructionsDictNonAVX.keys(), key=lambda x:(len(x), sorted(x))) + sortedPortCombinationsNonSSE = sorted(blockingInstructionsDictNonSSE.keys(), key=lambda x:(len(x), sorted(x))) + print 'sortedPortCombinations: ' + str(sortedPortCombinationsNonAVX) + + for i, instrNode in enumerate(instrNodeList): + if not instrNode in tpDict: + # don't iterate over the keys of unblocked_ports_dict directly because of the ordering + continue + + print 'Measuring port usage for ' + instrNode.attrib['string'] + ' (' + str(i) + '/' + str(len(instrNodeList)) + ')' + + htmlReports = ['

    ' + instrNode.attrib['string'] + ' - Port Usage' + (' (IACA '+iacaVersion+')' if useIACA else '') + '

    '] + + for useDistinctRegs in ([True, False] if instrNode in tpDictSameReg else [True]): + + tpResult = None + + if useDistinctRegs: + tpResult = tpDict[instrNode] + else: + if tpDict[instrNode].uops == tpDictSameReg[instrNode].uops: continue + tpResult = tpDictSameReg[instrNode] + htmlReports.append('

    With the same register for different operands

    ') + + rem_uops = max(tpResult.uops, int(sum(x for p, x in tpResult.unblocked_ports.items() if x>0) + .2)) + + if not useIACA and tpResult.config.preInstrNodes: + rem_uops -= sum(tpDict[instrNodeDict[preInstrNode.attrib['string']]].uops for preInstrNode in tpResult.config.preInstrNodes) + + # use abs because on, e.g., IVB port usages might be smaller in the second half of the experiments if replays happen + used_ports = {p for p, x in tpResult.unblocked_ports.items() if abs(x)>0.05} + if debugOutput: print instrNode.attrib['string'] + ' - used ports: ' + str(used_ports) + ', dict: ' + str(tpResult.unblocked_ports) + + if not isAVXInstr(instrNode): + blockingInstrs = blockingInstructionsDictNonAVX + sortedPortCombinations = sortedPortCombinationsNonAVX + else: + blockingInstrs = blockingInstructionsDictNonSSE + sortedPortCombinations = sortedPortCombinationsNonSSE + + uopsCombinationList = [] + + if not used_ports: + htmlReports.append('No uops') + elif (rem_uops == 1) and (not tpResult.config.preInstrNodes) and (not tpResult.ILD_stalls > 0): + # one uop instruction + uopsCombinationList = [(frozenset(used_ports), 1)] + htmlReports.append('
    Port usage: 1*' + ('p' if isIntelCPU() else 'FP') + ''.join(str(p) for p in used_ports)) + elif rem_uops > 0 and not isAMDCPU(): + for combination in sortedPortCombinations: + if not combination.intersection(used_ports): continue + + prevUopsOnCombination = 0 + for prev_combination, prev_uops in uopsCombinationList: + if prev_combination.issubset(combination): + prevUopsOnCombination += prev_uops + + if not useIACA: + if tpResult.config.preInstrNodes: + for preInstrNode in tpResult.config.preInstrNodes: + for pre_comb, pre_uops in portCombinationsResultDict[instrNodeDict[preInstrNode.attrib['string']]]: + if pre_comb.issubset(combination): + prevUopsOnCombination += pre_uops + + nPortsInComb = sum(len(str(x)) for x in combination) + blockInstrRep = max(2 * nPortsInComb * max(1,int(tpDict[instrNode].TP_single)), nPortsInComb * tpDict[instrNode].uops, 10) + blockInstrRep = min(blockInstrRep, 100) + uopsOnBlockedPorts = getUopsOnBlockedPorts(instrNode, useDistinctRegs, blockingInstrs[combination], blockInstrRep, combination, tpResult.config, htmlReports) + if uopsOnBlockedPorts is None: + print 'no uops on blocked ports: ' + str(combination) + continue + + uopsOnBlockedPorts -= prevUopsOnCombination + + if rem_uops < uopsOnBlockedPorts: + print 'More uops on ports than total uops, combination: ' + str(combination) + ', ' + str(uopsOnBlockedPorts) + + if uopsOnBlockedPorts <= 0: continue + + if combination == {storeDataPort} and instrNode.attrib.get('locked', '') == '1': + # for instructions with a lock prefix, the blocking instrs don't seem to be sufficient for actually blocking the store data port, which + # seems to lead to replays of the store data uops + uopsOnBlockedPorts = 1 + + uopsCombinationList.append((combination, uopsOnBlockedPorts)) + + htmlReports.append('⇨ ' + + ((str(uopsOnBlockedPorts) + ' μops') if (uopsOnBlockedPorts > 1) else 'One μop') + + ' that can only use port' + + ('s {' if len(combination)>1 else ' ') + + str(list(combination))[1:-1] + + 
('}' if len(combination)>1 else '') + '') + + rem_uops -= uopsOnBlockedPorts + if rem_uops <= 0: break + + # on ICL, some combinations (e.g. {4,9}) are treated as one port (49) above, as there is only a single counter for both ports + # we split these combinations now, as, e.g., the call to getTP_LP requires them to be separate + uopsCombinationList = [(frozenset(''.join(map(str,comb))), uops) for comb, uops in uopsCombinationList] + + if useDistinctRegs: + portCombinationsResultDict[instrNode] = uopsCombinationList + else: + portCombinationsResultDictSameReg[instrNode] = uopsCombinationList + + writeHtmlFile('html-ports/'+arch, instrNode, instrNode.attrib['string'], ''.join(htmlReports)) + + + ######################## + # Write XML File + ######################## + + for instrNode in tpDict: + archNode = instrNode.find('./architecture[@name="' + arch + '"]') + if useIACA: + resultNode = SubElement(archNode, "IACA") + resultNode.attrib['version'] = iacaVersion + else: + resultNode = archNode.find('./measurement') + + tpResult_dr = tpDict[instrNode] + tpResult_sr = tpDictSameReg.get(instrNode, tpResult_dr) + + for tpResult in ([tpResult_dr, tpResult_sr] if tpResult_dr.uops != tpResult_sr.uops else [tpResult_dr]): + suffix = ('' if tpResult == tpResult_dr else '_same_reg') + curPortCombinationsResultDict = (portCombinationsResultDict if tpResult == tpResult_dr else portCombinationsResultDictSameReg) + + uops = tpResult.uops + uopsFused = tpResult.fused_uops + if useIACA: + if uopsFused: + resultNode.attrib['fusion_occurred'] = '1' + else: + if tpResult.config.preInstrNodes: + uops -= sum(tpDict[instrNodeDict[preInstrNode.attrib['string']]].uops for preInstrNode in tpResult.config.preInstrNodes) + uopsFused -= sum(tpDict[instrNodeDict[preInstrNode.attrib['string']]].fused_uops for preInstrNode in tpResult.config.preInstrNodes) + if uopsFused > 0: + resultNode.attrib['uops_retire_slots'+suffix] = str(uopsFused) + resultNode.attrib['uops'+suffix] = str(uops) + + if 
useIACA and instrNode in latencyDict: + resultNode.attrib['latency'] = str(latencyDict[instrNode]) + + resultNode.attrib['TP'+suffix] = "%.2f" % tpResult.TP + if instrNode in tpDictNoInteriteration: + resultNode.attrib['TP_no_interiteration'] = "%.2f" % tpDictNoInteriteration[instrNode] + + divCycles = tpResult.divCycles + if divCycles: resultNode.attrib['div_cycles'+suffix] = str(divCycles) + + portPrefix = ('p' if isIntelCPU() else 'FP') + computePortStr = lambda lst: '+'.join(str(uops)+'*'+portPrefix+''.join(str(p) for p in sorted(c)) for c, uops in sorted(lst, key=lambda x: sorted(x[0]))) + if instrNode in curPortCombinationsResultDict: + portUsageList = curPortCombinationsResultDict[instrNode] + if not portUsageList: continue + + resultNode.attrib['ports'+suffix] = computePortStr(portUsageList) + + portUsageWithDivList = list(portUsageList) + if divCycles: + portUsageWithDivList.append((frozenset(['div']), divCycles)) + resultNode.attrib['TP_ports'+suffix] = "%.2f" % getTP_LP(portUsageWithDivList) + + with open(args.output, "w") as f: + reparsed = XMLRoot + if not args.noPretty: + rough_string = ET.tostring(XMLRoot, 'utf-8') + reparsed = minidom.parseString(rough_string) + f.write('\n'.join([line for line in reparsed.toprettyxml(indent=' ').split('\n') if line.strip()])) + + with tarfile.open('genhtml-' + arch + (('-IACA' + iacaVersion) if useIACA else '-Measurements') + '.tar.gz', "w:gz") as tar: + tar.add('/tmp/cpu-html/', arcname=os.path.sep) + + shutil.rmtree('/tmp/cpu-html/') + + try: + subprocess.check_output('umount /tmp/ramdisk/', shell=True) + except subprocess.CalledProcessError: + exit(1) + + print 'Total number of microbenchmarks: ' + str(nExperiments) + + +if __name__ == "__main__": + main() diff --git a/tools/cpuBench/mergeXML.py b/tools/cpuBench/mergeXML.py new file mode 100755 index 0000000..a46f120 --- /dev/null +++ b/tools/cpuBench/mergeXML.py @@ -0,0 +1,42 @@ +#!/usr/bin/python +import xml.etree.ElementTree as ET +from xml.dom import minidom 
import argparse
import datetime


def main():
    """Merge the measurement data of two instruction XML files.

    Command line: ``inp1 inp2 outp``.

    If inp2 contains a measurement node for an architecture for which inp1
    does not contain a measurement node, the node is added to a copy of inp1,
    which is then pretty-printed to ``outp``. Instructions are matched via
    their ``string`` attribute; instructions of inp1 without a counterpart in
    inp2 are reported on stdout and left untouched.
    """
    parser = argparse.ArgumentParser(description='Merge XML files')
    parser.add_argument('inp1')
    parser.add_argument('inp2')
    parser.add_argument('outp')
    args = parser.parse_args()

    root1 = ET.parse(args.inp1).getroot()
    root2 = ET.parse(args.inp2).getroot()
    instrNode2Dict = {instrNode.attrib['string']: instrNode for instrNode in root2.iter('instruction')}

    root1.attrib['date'] = str(datetime.date.today())

    for instrNode1 in root1.iter('instruction'):
        instrNode2 = instrNode2Dict.get(instrNode1.attrib['string'])
        if instrNode2 is None:
            # single-argument call form: prints the same text under Python 2 and 3
            print('no matching entry found for ' + instrNode1.attrib['string'])
            continue
        # Walk the architecture nodes of the matched instruction directly
        # (previously this looped over the element's children and relied on
        # iter() also yielding the child itself).
        for archNode2 in instrNode2.iter('architecture'):
            archNode1 = instrNode1.find('./architecture[@name="' + archNode2.attrib['name'] + '"]')
            if archNode1 is not None:
                # inp1 wins if it already has measurements for this architecture
                if archNode1.findall('./measurement'): continue
                for measurementNode in archNode2.findall('./measurement'):
                    archNode1.append(measurementNode)
            else:
                instrNode1.append(archNode2)

    with open(args.outp, "w") as f:
        rough_string = ET.tostring(root1, 'utf-8')
        reparsed = minidom.parseString(rough_string)
        # drop the blank lines that toprettyxml() emits
        f.write('\n'.join([line for line in reparsed.toprettyxml(indent=' '*2).split('\n') if line.strip()]))


if __name__ == "__main__":
    main()


# ---------------------------------------------------------------------------
# diff --git a/tools/cpuBench/utils.py b/tools/cpuBench/utils.py
# new file mode 100755, index 0000000..1f3ef51
# ---------------------------------------------------------------------------
import re
import sys
from scipy.optimize import linprog


def addHTMLCodeForOperands(instrNode, html):
    """Append the "Operands" section for instrNode to the list of HTML fragments `html`.

    Nothing is appended if instrNode has no 'operand' child.
    """
    # NOTE(review): the literals below look like they lost their HTML markup
    # (and possibly the code between them) in this copy of the file - confirm
    # against the upstream utils.py before relying on this function.
    if instrNode.find('operand') is not None:
        html.append('\n\n    Operands\n\n    ')
        html.append('')


def canonicalizeInstrString(instrString):
    """Replace every run of '(', ')', '{', '}', ',' and ' ' by a single '_' and strip leading/trailing '_'."""
    return re.sub('[(){}, ]+', '_', instrString).strip('_')


def getTP_LP(PU):
    """Return the throughput bound implied by a port usage, rounded to two decimals.

    PU is a list of (port combination, uops) pairs, where a port combination
    is an iterable of ports. The uops of every entry are distributed over the
    ports of its combination such that the maximum load z on any single port
    is minimal; this minimum is the result. An empty list yields 0; a single
    entry is computed directly as uops/len(ports).
    """
    if len(PU) == 0:
        return 0

    if len(PU) == 1:
        pc, uops = PU[0]
        return round(float(uops)/len(pc), 2)

    ports = list(set.union(*[set(pc) for pc, _ in PU]))
    nPU = len(PU)

    # LP variables: one x[p][i] per (port p, entry i), ordered port-major,
    # followed by z. linprog's default bounds make all of them non-negative.

    # an entry must not put uops on a port outside of its combination
    zeroConstraint = [0 if p in pc else 1 for p in ports for pc, _ in PU] + [0]

    # the uops of every entry must be fully distributed over its ports;
    # entries are identified by position so that two equal entries are not
    # merged into a single constraint
    nonZeroConstraints = []
    nonZeroConstraintsRHS = []
    for i, (pc, uops) in enumerate(PU):
        nonZeroConstraintsRHS.append(uops)
        nonZeroConstraints.append([1 if (i == j and p in pc) else 0 for p in ports for j in range(nPU)] + [0])

    A_eq = [zeroConstraint] + nonZeroConstraints
    b_eq = [0] + nonZeroConstraintsRHS

    # the load on every port is at most z
    zConstraints = [[1 if p == p2 else 0 for p2 in ports for _ in PU] + [-1] for p in ports]

    A_ub = zConstraints
    b_ub = [0] * len(zConstraints)

    # minimize z
    c = [0]*(nPU*len(ports)) + [1]

    res = linprog(c, A_ub=A_ub, b_ub=b_ub, A_eq=A_eq, b_eq=b_eq)
    return round(res.fun, 2)


def latencyNodeToStr(latNode, sameReg, addr_mem):
    """Describe one latency variant of latNode as text.

    The variant is selected by addr_mem ('', 'addr', 'addr_VSIB' or 'mem') and
    sameReg, which together form the attribute suffix. Returns None if latNode
    has neither a 'cycles' nor a 'min_cycles' attribute with that suffix.

    Example output: "Latency operand 2 → 1 (memory): ≤3"
    """
    suffix = ('_'+addr_mem if addr_mem else '') + ('_same_reg' if sameReg else '')
    attrib = latNode.attrib
    if 'cycles'+suffix not in attrib and 'min_cycles'+suffix not in attrib:
        return None

    ret = 'Latency operand ' + attrib['start_op'] + ' → ' + attrib['target_op']
    if sameReg:
        ret += ', with the same register for different operands'
    if addr_mem == 'addr':
        ret += ' (address, base register)'
    elif addr_mem == 'addr_VSIB':
        ret += ' (address, index register)'
    elif addr_mem == 'mem':
        ret += ' (memory)'
    ret += ': '

    if 'cycles'+suffix in attrib:
        # exact value, or a single upper bound
        if attrib.get('cycles'+suffix+'_is_upper_bound', '') == '1':
            ret += '≤'
        ret += attrib['cycles'+suffix]
    else:
        minCycles = attrib['min_cycles'+suffix]
        maxCycles = attrib['max_cycles'+suffix]

        if attrib.get('min_cycles'+suffix+'_is_upper_bound', '') == '1':
            # the lower end is itself only an upper bound; the range is not shown
            ret += '≤' + minCycles
        else:
            ret += minCycles + ' ≤ lat ≤ ' + maxCycles

    return ret


def getLatencyTableEntry(measurementNode):
    """Summarize all latency values below measurementNode.

    Returns (string, minLat, minLatUB, maxLat, maxLatUB), or None if
    measurementNode is None or has no 'latency' child. minLat/maxLat are the
    smallest/largest cycle counts found in the 'cycles', 'min_cycles' and
    'max_cycles' attributes (plain, '_addr' and '_mem' variants, each with and
    without '_same_reg'); the *UB flags tell whether that value is only an
    upper bound.

    Example output: ("[1;≤7]", 1, False, 7, True)
    """
    if measurementNode is None or measurementNode.find('./latency') is None:
        return None

    # sys.maxsize is available on Python 2.6+ and Python 3 (sys.maxint is Python 2 only)
    minLat = sys.maxsize
    maxLat = 0
    minLatUB = False
    maxLatUB = False

    # (attribute name, contributes to the maximum, contributes to the minimum)
    attrKinds = [('cycles', True, True), ('max_cycles', True, False), ('min_cycles', False, True)]

    for latNode in measurementNode.findall('./latency'):
        for sameReg in [False, True]:
            # NOTE(review): 'addr_VSIB', which latencyNodeToStr accepts, is not
            # considered here - confirm whether that is intended.
            for addr_mem in ['', 'addr', 'mem']:
                suffix = ('_'+addr_mem if addr_mem else '') + ('_same_reg' if sameReg else '')
                for attrName, forMax, forMin in attrKinds:
                    key = attrName + suffix
                    if key not in latNode.attrib:
                        continue
                    # all cycle counts are parsed as integers so that the
                    # minimum is formatted like the maximum ("2", not "2.0")
                    cycles = int(latNode.attrib[key])
                    isUB = (latNode.attrib.get(key+'_is_upper_bound', '') == '1')

                    if forMax:
                        if cycles == maxLat:
                            # the maximum is exact as soon as one exact value reaches it
                            maxLatUB = (maxLatUB and isUB)
                        elif cycles > maxLat:
                            maxLat = cycles
                            maxLatUB = isUB

                    if forMin:
                        if cycles == minLat:
                            minLatUB = (minLatUB or isUB)
                        elif cycles < minLat:
                            minLat = cycles
                            minLatUB = isUB

    if minLat == maxLat:
        latStr = str(maxLat)
        if minLatUB or maxLatUB:
            latStr = '≤' + latStr
    else:
        latStr = '['
        if minLatUB:
            latStr += '≤'
        latStr += str(minLat)
        latStr += ';'
        if maxLatUB:
            latStr += '≤'
        latStr += str(maxLat)
        latStr += ']'

    return (latStr, minLat, minLatUB, maxLat, maxLatUB)