def _build_register_sizes():
    """Build the lookup table used by ``Register``.

    Returns:
        dict mapping an upper-case register name to the tuple
        ``(width in bits, register class)``.
    """
    table = {}

    def add(names, bits, kind):
        for reg in names:
            table[reg] = (bits, kind)

    extended = ["R{}".format(num) for num in range(8, 16)]

    # General purpose registers
    add(["AH", "AL", "BH", "BL", "CH", "CL", "DH", "DL",
         "BPL", "SIL", "DIL", "SPL"], 8, "GPR")
    add([reg + "L" for reg in extended], 8, "GPR")
    add([reg + "B" for reg in extended], 8, "GPR")
    # The original table listed "BC" here, a typo for "BX".
    add(["AX", "BX", "CX", "DX", "BP", "SI", "DI", "SP"], 16, "GPR")
    add([reg + "W" for reg in extended], 16, "GPR")
    add(["EAX", "EBX", "ECX", "EDX", "EBP", "ESI", "EDI", "ESP"], 32, "GPR")
    add([reg + "D" for reg in extended], 32, "GPR")
    add(["RAX", "RBX", "RCX", "RDX", "RBP", "RSI", "RDI", "RSP"], 64, "GPR")
    add(extended, 64, "GPR")
    # Segment, flag and instruction-pointer registers are filed under GPR too.
    add(["CS", "DS", "SS", "ES", "FS", "GS"], 16, "GPR")
    add(["EFLAGS", "EIP"], 32, "GPR")
    add(["RFLAGS", "RIP"], 64, "GPR")

    # Numbered register files: (name prefix, count, width in bits, class)
    numbered_files = (
        ("ST", 8, 80, "FPU"),     # FPU registers
        ("MM", 8, 64, "MMX"),     # MMX registers
        ("XMM", 32, 128, "XMM"),  # XMM registers
        ("YMM", 32, 256, "YMM"),  # YMM registers
        ("ZMM", 32, 512, "ZMM"),  # ZMM registers
        ("K", 8, 64, "K"),        # Opmask registers
        ("BND", 4, 128, "BND"),   # Bounds registers
    )
    for prefix, count, bits, kind in numbered_files:
        add([prefix + str(num) for num in range(count)], bits, kind)

    # Registers in general (placeholders naming a whole register class)
    table.update({
        "GPR8": (8, "GPR"), "GPR16": (16, "GPR"), "GPR32": (32, "GPR"),
        "GPR64": (64, "GPR"), "FPU": (80, "FPU"), "MMX": (64, "MMX"),
        "XMM": (128, "XMM"), "YMM": (256, "YMM"), "ZMM": (512, "ZMM"),
        "K": (64, "K"), "BND": (128, "BND"),
    })
    return table


class Parameter(object):
    """Instruction operand described only by its kind.

    ``ptype`` is one of the entries of ``type_list`` (stored upper-case).
    """
    type_list = ["REG", "MEM", "IMD", "LBL", "NONE"]

    def __init__(self, ptype, name=""):
        """Store the operand kind.

        Args:
            ptype: operand kind, case-insensitive.
            name: accepted for interface compatibility; not used here.

        Raises:
            NameError: if ``ptype`` is not listed in ``type_list``.
        """
        self.ptype = ptype.upper()
        if self.ptype not in self.type_list:
            raise NameError("Type not supported: " + ptype)

    def print(self):
        """Return the operand kind, or an empty string for ``NONE``."""
        if self.ptype == "NONE":
            return ""
        return self.ptype


class MemAddr(Parameter):
    """Memory operand of the form ``sreg:offset(base, index, scale)``.

    The constructor only records which of the components are present.
    """
    segment_regs = ["CS", "DS", "SS", "ES", "FS", "GS"]
    scales = [1, 2, 4, 8]

    def __init__(self, name):
        """Parse the textual operand ``name``.

        Raises:
            NameError: for an unknown segment register, an unsupported
                number of comma separated fields inside the parentheses,
                or a scale that is not one of ``scales``.
        """
        # The base-class constructor was never run, so ptype was missing.
        self.ptype = "MEM"
        self.sreg = False
        self.offset = False
        self.base = False
        self.index = False
        self.scale = False
        if ':' in name:
            # name[0] is skipped (register prefix character), the rest up to
            # the colon has to be a segment register.
            if name[1:name.index(':')].upper() not in self.segment_regs:
                raise NameError("Type not supported: " + name)
            self.sreg = True
            self.offset = True
        if '(' not in name or name.index('(') != 0:
            self.offset = True
        if '(' in name:
            self.parentheses = name[name.index('(') + 1:-1]
            self.commacnt = self.parentheses.count(',')
            # Compare the scale as text so that a non-numeric last character
            # is reported as unsupported instead of raising ValueError.
            valid_scales = [str(val) for val in self.scales]
            if self.commacnt == 0:
                self.base = True
            elif self.commacnt == 2 and self.parentheses[-1:] in valid_scales:
                self.base = True
                self.index = True
                self.scale = True
            else:
                raise NameError("Type not supported: " + name)

    def print(self):
        """Return the operand form, e.g. ``MEM(offset(base))``."""
        self.mem_format = "MEM("
        if self.sreg:
            self.mem_format += "sreg:"
        if self.offset:
            self.mem_format += "offset"
        if self.base and not self.index:
            self.mem_format += "(base)"
        elif self.base and self.index and self.scale:
            self.mem_format += "(base, index, scale)"
        self.mem_format += ")"
        return self.mem_format


class Register(Parameter):
    """Register operand with its width in bits and its register class."""
    # NAME -> (width in bits, register class)
    sizes = _build_register_sizes()

    def __init__(self, name, mask=False):
        """Look up ``name`` (case-insensitive) in ``sizes``.

        Args:
            name: register name without any prefix character.
            mask: True if the operand carries an opmask annotation.

        Raises:
            NameError: if the register is not part of ``sizes``.
        """
        self.ptype = "REG"
        self.name = name.upper()
        self.mask = mask
        # The original guard `if[name in self.sizes]` tested a non-empty list
        # and was therefore always true; look up the normalised name instead.
        if self.name not in self.sizes:
            raise NameError("Register name not in dictionary: " + self.name)
        self.size, self.reg_type = self.sizes[self.name]

    def print(self):
        """Return the register class, followed by ``{opmask}`` if masked."""
        opmask = ""
        if self.mask:
            opmask = "{opmask}"
        return self.reg_type + opmask
'ymm16', 'ymm17', 'ymm18', 'ymm19', + 'ymm20', 'ymm21', 'ymm22', 'ymm23', 'ymm24', 'ymm25', 'ymm26', 'ymm27', 'ymm28', 'ymm29', + 'ymm30', 'ymm31'] + zmms = ['zmm0', 'zmm1', 'zmm2', 'zmm3', 'zmm4', 'zmm5', 'zmm6', 'zmm7', 'zmm8', 'zmm9', + 'zmm10', 'zmm11', 'zmm12', 'zmm13', 'zmm14', 'zmm15', 'zmm16', 'zmm17', 'zmm18', 'zmm19', + 'zmm20', 'zmm21', 'zmm22', 'zmm23', 'zmm24', 'zmm25', 'zmm26', 'zmm27', 'zmm28', 'zmm29', + 'zmm30', 'zmm31'] + + ops = {'gpr64':gprs64, 'gpr32':gprs32, 'gpr16':gprs16, 'gpr8':gprs8, 'fpu':fpus, 'mmx':mmxs, 'k':ks, 'bnd':bnds, 'xmm':xmms, 'ymm':ymms, 'zmm':zmms} + +# Create Single Precision 1.0 + sp1 = '\t\t# create SP 1.0\n' + sp1 += '\t\tvpcmpeqw xmm0, xmm0, xmm0\n' + sp1 += '\t\tvpslld xmm0, xmm0, 25\t\t\t# logical left shift: 11111110..0 (25=32-(8-1))\n' + sp1 += '\t\tvpsrld xmm0, xmm0, 2\t\t\t# logical right shift: 1 bit for sign; leading mantissa bit is zero\n' + sp1 += '\t\t# copy SP 1.0\n' +# Create Double Precision 1.0 + dp1 = '\t\t# create DP 1.0\n' + dp1 += '\t\tvpcmpeqw xmm0, xmm0, xmm0\t\t# all ones\n' + dp1 += '\t\tvpsllq xmm0, xmm0, 54\t\t\t# logical left shift: 11111110..0 (54=64-(10-1))\n' + dp1 += '\t\tvpsrlq xmm0, xmm0, 2\t\t\t# logical right shift: 1 bit for sign; leading mantissa bit is zero\n' +# Create epilogue + done = ('done:\n' + '\t\tmov\trsp, rbp\n' + '\t\tpop\trbp\n' + '\t\tret\n' + '.size latency, .-latency') +##---------------------------------------------------------------- + +# Constructor + def __init__(self, _mnemonic, _param_list, _num_instr='12'): + self.instr = _mnemonic.lower() + self.param_list = _param_list +# num_instr must be an even number + self.num_instr = str(ceil(int(_num_instr)/2)*2) +# Check for the number of operands and initialise the GPRs if necessary + self.reg_a, self.reg_b, self.reg_c, self.gprPush, self.gprPop, self.zeroGPR, self.copy = self.__define_regs() + self.num_regs = len(self.param_list) + +# Create asm header + self.def_instr, self.ninstr, self.init, self.expand = 
self.__define_header() +# Create latency and throughput loop + self.loop_lat = self.__define_loop_lat() + self.loop_thrpt = self.__define_loop_thrpt() + + + def write_testcase(self): + regs = self.param_list + extension = '' +# Add operands + extension += ('-'+(self.reg_a if ('gpr' not in self.reg_a) else 'r'+self.reg_a[3:]) + ('_') + + (self.reg_b if ('gpr' not in self.reg_b) else 'r'+self.reg_b[3:]) + ('_') + + (self.reg_c if ('gpr' not in self.reg_c) else 'r'+self.reg_c[3:])) +# Write latency file + call(['mkdir', '-p', 'testcases']) + f = open('./testcases/'+self.instr+extension+'.S', 'w') + data = (self.def_instr+self.ninstr+self.init+self.dp1+self.expand+self.gprPush+self.zeroGPR+self.copy+self.loop_lat+self.gprPop+self.done) + f.write(data) + f.close() +# Write throughput file + f = open('./testcases/'+self.instr+extension+'-TP.S', 'w') + data = (self.def_instr+self.ninstr+self.init+self.dp1+self.expand+self.gprPush+self.zeroGPR+self.copy+self.loop_thrpt+self.gprPop+self.done) + f.write(data) + f.close() + + +# Check register + def __define_regs(self): + regs = self.param_list + reg_a, reg_b, reg_c = ('', '', '') + gprPush, gprPop, zeroGPR = ('', '', '') + reg_a = regs[0].reg_type.lower() + if(reg_a == 'gpr'): + gprPush, gprPop, zeroGPR = self.__initialise_gprs() + reg_a += str(regs[0].size) + if(len(regs) > 1): + reg_b = regs[1].reg_type.lower() + if(reg_b == 'gpr'): + reg_b += str(regs[1].size) + if('gpr' not in reg_a): + gprPush, gprPop, zeroGPR = self.__initialise_gprs() + if(len(regs) == 3): + reg_c = regs[2].reg_type.lower() + if(reg_c == 'gpr'): + reg_c += str(regs[2].size) + if(('gpr' not in reg_a) and ('gpr'not in reg_b)): + gprPush, gprPop, zeroGPR = self.__initialise_gprs() + if(len(regs) == 1): + copy = self.__copy_regs(regs[0]) + else: + copy = self.__copy_regs(regs[1]) + return (reg_a, reg_b, reg_c, gprPush, gprPop, zeroGPR, copy) + + +# Initialise 11 general purpose registers and set them to zero + def __initialise_gprs(self): + gprPush = '' + 
# Copy created values in specific register
def __copy_regs(self, reg):
    """Return the assembly snippet that seeds the operand registers.

    The snippet copies the value held in xmm0 into the first registers of
    the register file selected by ``reg.reg_type`` and derives two further
    values from it (labelled "DP 2.0" and "DP 0.5" in the emitted comments).

    Args:
        reg: operand object exposing ``reg_type`` ('GPR', 'MMX', 'XMM',
            'YMM' or 'ZMM').

    Returns:
        The snippet as a string; an empty string for any other register
        class.
    """
    copy = '\t\t# copy DP 1.0\n'
    # Different handling for GPR, MMX and SSE/AVX registers
    if reg.reg_type == 'GPR':
        gpr = self.ops['gpr64']
        copy += '\t\tvmovq {}, xmm0\n'.format(gpr[0])
        copy += '\t\tvmovq {}, xmm0\n'.format(gpr[1])
        copy += '\t\t# Create DP 2.0\n'
        copy += '\t\tadd {}, {}\n'.format(gpr[1], gpr[0])
        copy += '\t\t# Create DP 0.5\n'
        copy += '\t\tdiv {}\n'.format(gpr[0])
        copy += '\t\tmovq {}, {}\n'.format(gpr[2], gpr[0])
        copy += '\t\tvmovq {}, xmm0\n'.format(gpr[0])
    elif reg.reg_type == 'MMX':
        mmx = self.ops['mmx']
        gpr = self.ops['gpr64']
        copy += '\t\tvmovq {}, xmm0\n'.format(mmx[0])
        copy += '\t\tvmovq {}, xmm0\n'.format(mmx[1])
        copy += '\t\tvmovq {}, xmm0\n'.format(gpr[0])
        copy += '\t\t# Create DP 2.0\n'
        # The original read the bare name `ops` here, which is not visible
        # inside a method and raised NameError; it has to go through self.
        copy += '\t\tadd {}, {}\n'.format(mmx[1], mmx[0])
        copy += '\t\t# Create DP 0.5\n'
        copy += '\t\tdiv {}\n'.format(gpr[0])
        copy += '\t\tmovq {}, {}\n'.format(mmx[2], gpr[0])
    elif reg.reg_type in ('XMM', 'YMM', 'ZMM'):
        vec = self.ops[reg.reg_type.lower()]
        copy += '\t\tvmovaps {}, {}\n'.format(vec[0], vec[0])
        copy += '\t\tvmovaps {}, {}\n'.format(vec[1], vec[0])
        copy += '\t\t# Create DP 2.0\n'
        copy += '\t\tvaddpd {}, {}, {}\n'.format(vec[1], vec[1], vec[1])
        copy += '\t\t# Create DP 0.5\n'
        copy += '\t\tvdivpd {}, {}, {}\n'.format(vec[2], vec[0], vec[1])
    else:
        # No initialisation is generated for other register classes.
        copy = ''
    return copy
__define_header(self): + def_instr = '#define INSTR '+self.instr+'\n' + ninstr = '#define NINST '+self.num_instr+'\n' + init = ('#define N edi\n' \ + '#define i r8d\n\n\n' + '.intel_syntax noprefix\n' + '.globl ninst\n' + '.data\n' + 'ninst:\n' + '.long NINST\n' + '.text\n' + '.globl latency\n' + '.type latency, @function\n' + '.align 32\n' + 'latency:\n' + '\t\tpush\trbp\n' + '\t\tmov\trbp, rsp\n' + '\t\txor\ti, i\n' + '\t\ttest\tN, N\n' + '\t\tjle\tdone\n') +# Expand to AVX(512) if necessary + expand = '' + if(self.reg_a == 'ymm' or self.reg_b == 'ymm' or self.reg_c == 'ymm'): + expand = ('\t\t# expand from SSE to AVX\n' + '\t\tvinsertf128 ymm0, ymm0, xmm0, 0x1\n') + if(self.reg_a == 'zmm' or self.reg_b == 'zmm' or self.reg_c == 'zmm'): + expand = ('\t\t# expand from SSE to AVX\n' + '\t\tvinsertf128 ymm0, ymm0, xmm0, 0x1\n' + '\t\t# expand from AVX to AVX512\n' + '\t\tvinsert64x4 zmm0, zmm0, ymm0, 0x1\n') + return (def_instr, ninstr, init, expand) + +# Create latency loop + def __define_loop_lat(self): + loop_lat = ('loop:\n' + '\t\tinc i\n') + if(self.num_regs == 1): + for i in range(0, int(self.num_instr)): + loop_lat += '\t\tINSTR {}\n'.format(self.ops[self.reg_a][0]) + elif(self.num_regs == 2 and self.reg_a == self.reg_b): + for i in range(0, int(self.num_instr), 2): + loop_lat += '\t\tINSTR {}, {}\n'.format(self.ops[self.reg_a][0], self.ops[self.reg_b][1]) + loop_lat += '\t\tINSTR {}, {}\n'.format(self.ops[self.reg_b][1], self.ops[self.reg_b][0]) + elif(self.num_regs == 2 and self.reg_a != self.reg_b): + for i in range(0, int(self.num_instr), 2): + loop_lat += '\t\tINSTR {}, {}\n'.format(self.ops[self.reg_a][0], self.ops[self.reg_b][0]) + loop_lat += '\t\tINSTR {}, {}\n'.format(self.ops[self.reg_a][0], self.ops[self.reg_b][0]) + elif(self.num_regs == 3 and self.reg_a == self.reg_b): + for i in range(0, int(self.num_instr), 2): + loop_lat += '\t\tINSTR {}, {}, {}\n'.format(self.ops[self.reg_a][0], self.ops[self.reg_b][1], self.ops[self.reg_c][0]) + loop_lat 
+= '\t\tINSTR {}, {}, {}\n'.format(self.ops[self.reg_a][1], self.ops[self.reg_b][0], self.ops[self.reg_c][0]) + elif(self.num_regs == 3 and self.reg_a == self.reg_c): + for i in range(0, int(self.num_instr), 2): + loop_lat += '\t\tINSTR {}, {}, {}\n'.format(self.ops[self.reg_a][0], self.ops[self.reg_b][0], self.ops[self.reg_c][0]) + loop_lat += '\t\tINSTR {}, {}, {}\n'.format(self.ops[self.reg_a][1], self.ops[self.reg_b][0], self.ops[self.reg_c][0]) + loop_lat += ('\t\tcmp i, N\n' + '\t\tjl loop\n') + return loop_lat + +# Create throughput loop + def __define_loop_thrpt(self): + loop_thrpt = ('loop:\n' + '\t\tinc i\n') + ext = '' + ext1 = False + ext2 = False + if(self.num_regs == 2): + ext1 = True + if(self.num_regs == 3): + ext1 = True + ext2 = True + for i in range(0, int(self.num_instr)): + if(ext1): + ext = ', {}'.format(self.ops[self.reg_b][i%3]) + if(ext2): + ext += ', {}'.format(self.ops[self.reg_c][i%3]) + regNum = i%len(self.ops[self.reg_a]) if (i > 2) else (i+3)%len(self.ops[self.reg_a]) + loop_thrpt += '\t\tINSTR {}{}\n'.format(self.ops[self.reg_a][regNum], ext) + loop_thrpt += ('\t\tcmp i, N\n' + '\t\tjl loop\n') + return loop_thrpt + + + def __is_in_dir(self, name, path): + for root, dirs, files in os.walk(path): + if name in files: + return True + return False diff --git a/data/ivb_throughput.csv b/data/ivb_throughput.csv new file mode 100644 index 0000000..f2f89b2 --- /dev/null +++ b/data/ivb_throughput.csv @@ -0,0 +1,42 @@ +instr,clock_cycles +vmovapd-TP,0.84 +vaddsd-TP,1.016 +inc-TP,0.446 +cmp-TP,0.447 +inc-rrxmm-TP,0.446 +cmp-rrxmm-TP,0.446 +vmovq-TP,1.17 +vmovsd-TP,1.17 +xor-TP,0.336 +vxorpd-avx-TP,0.335 +vmovq-rxmmxmm-TP,1.004 +vxorps-TP,0.336 +vunpckhpd-TP,1.177 +test-TP,0.446 +vmulsd-TP,1.0170000000000001 +test-rrxmm-TP,0.446 +add-TP,0.47200000000000003 +neg-TP,0.447 +add-rrxmm-TP,0.47100000000000003 +mov-TP,0.386 +mov-rrxmm-TP,0.37 +vaddpd-avx-TP,1.016 +xor-rrxmm-TP,0.336 +sub-TP,0.335 +sub-rrxmm-TP,0.336 +vxorpd-TP,0.336 
def extract_instr(asmFile):
    """Feed every line of the listing ``asmFile`` to ``check_line``.

    The module-level ``fname`` is set to ``asmFile`` and ``lncnt`` tracks the
    1-based number of the line currently being processed.

    Args:
        asmFile: path of the listing; it has to end in ``.log``.

    Raises:
        SystemExit: if ``asmFile`` does not end in ``.log``.
    """
    global lncnt
    global fname
    fname = asmFile
    # Check if parameter is in the correct file format
    if not asmFile.endswith(".log"):
        print("Invalid argument")
        sys.exit()
    # Open file
    try:
        f = open(asmFile, "r")
    except IOError:
        print("IOError: File not found")
        # Without this return the loop below ran on an unbound `f`.
        return
    # Analyse code line by line and check the instructions; the context
    # manager closes the file even if check_line raises.
    with f:
        lncnt = 1
        for line in f:
            check_line(line)
            lncnt += 1
anymore - or yet - so we decrement the semaphore + sem = sem-1 + +#Check if seperator is either tabulator or whitespace +def set_counter_char(line): + global cntChar + numSpaces = (re.split(marker,line)[0]).count(" ") + numTabs = (re.split(marker,line)[0]).count("\t") + if(numSpaces != 0 and numTabs == 0): + cntChar = ' ' + elif(numSpaces == 0 and numTabs != 0): + cntChar = '\t' + else: + raise NotImplementedError("Indentation of code is only supported for whitespaces and tabs.") + + +def check_instr(instr): + global db + global lncnt + global cnt + global fname +#Check for strange clang padding bytes + while(instr.startswith("data32")): + instr = instr[7:] +#Seperate mnemonic and operands + mnemonic = instr.split()[0] + params = "".join(instr.split()[1:]) +#Check if line is not only a byte + empty_byte = re.compile(r'[0-9a-f]{2}') + if(re.match(empty_byte, mnemonic) and len(mnemonic) == 2): + return +#Check if there's one or more operand and store all in a list + param_list = flatten(separate_params(params)) + regList = list(param_list) +#Check operands and seperate them by IMMEDIATE (IMD), REGISTER (REG), MEMORY (MEM) or LABEL (LBL) + for i in range(len(param_list)): + op = param_list[i] + if(len(op) <= 0): + op = Parameter("NONE") + elif(op[0] == '$'): + op = Parameter("IMD") + elif(op[0] == '%' and '(' not in op): + j = len(op) + opmask = False + if('{' in op): + j = op.index('{') + opmask = True + op = Register(op[1:j], opmask) + elif('<' in op): + op = Parameter("LBL") + else: + op = MemAddr(op) + param_list[i] = op.print() + regList[i] = op +#Join mnemonic and operand(s) to an instruction form + if(len(mnemonic) > 7): + tabs = "\t" + else: + tabs = "\t\t" + instr_form = mnemonic+tabs+(" ".join(param_list)) +#Check in database for instruction form and increment the counter + if(instr_form in db): + db[instr_form] = db[instr_form]+1 + else: + db[instr_form] = 1 +#Create testcase for instruction form, since it is the first appearance of it +#But (as far as now) 
only for instr forms with only registers as operands + is_Reg = True + for par in regList: +# print(par.print()+" is Register: "+str(isinstance(par, Register))) + if(not isinstance(par, Register)): + is_Reg = False + if(is_Reg): + #print(mnemonic) +# print("create testcase for "+mnemonic+" with params:") +# for p in regList: +# print(p.print(),end=", ") +# print() +#Create testcase with reversed param list, due to the fact its intel syntax! +# create_testcase(mnemonic, list(reversed(regList))) + tc = Testcase(mnemonic, list(reversed(regList)), '24') + tc.write_testcase() +# print("-----------") + +def separate_params(params): + param_list = [params] + if(',' in params): + if(')' in params): + if(params.index(')') < len(params)-1 and params[params.index(')')+1] == ','): + i = params.index(')')+1 + elif(params.index('(') < params.index(',')): + return param_list + else: + i = params.index(',') + else: + i = params.index(',') + param_list = [params[:i],separate_params(params[i+1:])] + elif('#' in params): + i = params.index('#') + param_list = [params[:i]] + return param_list + + +def sort_db(): + global sorted_db + sorted_db=sorted(db.items(), key=lambda x:x[1], reverse=True) + + +def print_sorted_db(): + sort_db() + sum = 0 + print("Number of\tmnemonic") + print("calls\n") + for i in range(len(sorted_db)): + print(str(sorted_db[i][1])+"\t\t"+sorted_db[i][0]) + sum += sorted_db[i][1] + print("\nCumulated number of instructions: "+str(sum)) + + +def save_db(): + global db + file = open(".cnt_asm_ops.db","w") + for i in db.items(): + file.write(i[0]+"\t"+str(i[1])+"\n") + file.close() + + +def load_db(): + global db + try: + file = open(".cnt_asm_ops.db", "r") + except FileNotFoundError: + print("no database found in current directory") + return + for line in file: + mnemonic = line.split('\t')[0] +#Join mnemonic and operand(s) to an instruction form + if(len(mnemonic) > 7): + tabs = "\t" + params = line.split('\t')[1] + numCalls = line.split("\t")[2][:-1] + else: + 
tabs = "\t\t" + params = line.split('\t')[2] + numCalls = line.split("\t")[3][:-1] + instr_form = mnemonic+tabs+params + db[instr_form] = int(numCalls) + file.close() + + +def flatten(l): + if l == []: + return l + if(isinstance(l[0], list)): + return flatten(l[0]) + flatten(l[1:]) + return l[:1] + flatten(l[1:]) + + + + +class Parameter(object): + type_list = ["REG", "MEM", "IMD", "LBL", "NONE"] + def __init__(self, ptype, name=""): + self.ptype = ptype.upper() + if(self.ptype not in self.type_list): + raise NameError("Type not supported: "+ptype) + + def print(self): + if(self.ptype == "NONE"): + return "" + else: + return self.ptype + +class MemAddr(Parameter): + segment_regs = ["CS", "DS", "SS", "ES", "FS", "GS"] + scales = [1, 2, 4, 8] + def __init__(self, name): + self.sreg = False + self.offset = False + self.base = False + self.index = False + self.scale = False + if(':' in name): + if(name[1:name.index(':')].upper() not in self.segment_regs): + raise NameError("Type not supported: "+name) + self.sreg = True + self.offset = True + if('(' not in name or ('(' in name and name.index('(') != 0)): + self.offset = True + if('(' in name): + self.parentheses = name[name.index('(')+1:-1] + self.commacnt = self.parentheses.count(',') + if(self.commacnt == 0): + self.base = True + elif(self.commacnt == 2 and int(self.parentheses[-1:]) in self.scales): + self.base = True + self.index = True + self.scale = True + else: + raise NameError("Type not supported: "+name) + + def print(self): + self.mem_format = "MEM(" + if(self.sreg): + self.mem_format += "sreg:" + if(self.offset): + self.mem_format += "offset" + if(self.base and not self.index): + self.mem_format += "(base)" + elif(self.base and self.index and self.scale): + self.mem_format += "(base, index, scale)" + self.mem_format += ")" + return self.mem_format + + + +class Register(Parameter): + sizes = { +#General Purpose Registers + "AH":(8,"GPR"), "AL":(8,"GPR"), "BH":(8,"GPR"), "BL":(8,"GPR"), "CH":(8,"GPR"), 
"CL":(8,"GPR"), "DH":(8,"GPR"), "DL":(8,"GPR"), "BPL":(8,"GPR"), "SIL":(8,"GPR"), "DIL":(8,"GPR"), "SPL":(8,"GPR"), "R8L":(8,"GPR"), "R9L":(8,"GPR"), "R10L":(8,"GPR"), "R11L":(8,"GPR"), "R12L":(8,"GPR"), "R13L":(8,"GPR"), "R14L":(8,"GPR"), "R15L":(8,"GPR"), + "R8B":(8,"GPR"),"R9B":(8,"GPR"),"R10B":(8,"GPR"),"R11B":(8,"GPR"),"R12B":(8,"GPR"),"R13B":(8,"GPR"),"R14B":(8,"GPR"),"R15B":(8,"GPR"), + "AX":(16,"GPR"), "BC":(16,"GPR"), "CX":(16,"GPR"), "DX":(16,"GPR"), "BP":(16,"GPR"), "SI":(16,"GPR"), "DI":(16,"GPR"), "SP":(16,"GPR"), "R8W":(16,"GPR"), "R9W":(16,"GPR"), "R10W":(16,"GPR"), "R11W":(16,"GPR"), "R12W":(16,"GPR"), "R13W":(16,"GPR"), "R14W":(16,"GPR"), "R15W":(16,"GPR"), + "EAX":(32,"GPR"), "EBX":(32,"GPR"), "ECX":(32,"GPR"), "EDX":(32,"GPR"), "EBP":(32,"GPR"), "ESI":(32,"GPR"), "EDI":(32,"GPR"), "ESP":(32,"GPR"), "R8D":(32,"GPR"), "R9D":(32,"GPR"), "R10D":(32,"GPR"), "R11D":(32,"GPR"), "R12D":(32,"GPR"), "R13D":(32,"GPR"), "R14D":(32,"GPR"), "R15D":(32,"GPR"), + "RAX":(64,"GPR"), "RBX":(64,"GPR"), "RCX":(64,"GPR"), "RDX":(64,"GPR"), "RBP":(64,"GPR"), "RSI":(64,"GPR"), "RDI":(64,"GPR"), "RSP":(64,"GPR"), "R8":(64,"GPR"), "R9":(64,"GPR"), "R10":(64,"GPR"), "R11":(64,"GPR"), "R12":(64,"GPR"), "R13":(64,"GPR"), "R14":(64,"GPR"), "R15":(64,"GPR"), + "CS":(16,"GPR"), "DS":(16,"GPR"), "SS":(16,"GPR"), "ES":(16,"GPR"), "FS":(16,"GPR"), "GS":(16,"GPR"), + "EFLAGS":(32,"GPR"), "RFLAGS":(64,"GPR"), "EIP":(32,"GPR"), "RIP":(64,"GPR"), +#FPU Registers + "ST0":(80,"FPU"),"ST1":(80,"FPU"),"ST2":(80,"FPU"),"ST3":(80,"FPU"),"ST4":(80,"FPU"),"ST5":(80,"FPU"),"ST6":(80,"FPU"),"ST7":(80,"FPU"), +#MMX Registers + "MM0":(64,"MMX"),"MM1":(64,"MMX"),"MM2":(64,"MMX"),"MM3":(64,"MMX"),"MM4":(64,"MMX"),"MM5":(64,"MMX"),"MM6":(64,"MMX"),"MM7":(64,"MMX"), +#XMM Registers + "XMM0":(128,"XMM"),"XMM1":(128,"XMM"),"XMM2":(128,"XMM"),"XMM3":(128,"XMM"),"XMM4":(128,"XMM"),"XMM5":(128,"XMM"),"XMM6":(128,"XMM"),"XMM7":(128,"XMM"), "XMM8":(128,"XMM"), "XMM9":(128,"XMM"), "XMM10":(128,"XMM"), 
"XMM11":(128,"XMM"), "XMM12":(128,"XMM"), "XMM13":(128,"XMM"), "XMM14":(128,"XMM"), "XMM15":(128,"XMM"), "XMM16":(128,"XMM"), "XMM17":(128,"XMM"), "XMM18":(128,"XMM"), "XMM19":(128,"XMM"), "XMM20":(128,"XMM"), "XMM21":(128,"XMM"), "XMM22":(128,"XMM"), "XMM23":(128,"XMM"), "XMM24":(128,"XMM"), "XMM25":(128,"XMM"), "XMM26":(128,"XMM"), "XMM27":(128,"XMM"), "XMM28":(128,"XMM"), "XMM29":(128,"XMM"), "XMM30":(128,"XMM"), "XMM31":(128,"XMM"), +#YMM Registers + "YMM0":(256,"YMM"),"YMM1":(256,"YMM"),"YMM2":(256,"YMM"),"YMM3":(256,"YMM"),"YMM4":(256,"YMM"),"YMM5":(256,"YMM"),"YMM6":(256,"YMM"),"YMM7":(256,"YMM"), "YMM8":(256,"YMM"), "YMM9":(256,"YMM"), "YMM10":(256,"YMM"), "YMM11":(256,"YMM"), "YMM12":(256,"YMM"), "YMM13":(256,"YMM"), "YMM14":(256,"YMM"), "YMM15":(256,"YMM"), "YMM16":(256,"YMM"), "YMM17":(256,"YMM"), "YMM18":(256,"YMM"), "YMM19":(256,"YMM"), "YMM20":(256,"YMM"), "YMM21":(256,"YMM"), "YMM22":(256,"YMM"), "YMM23":(256,"YMM"), "YMM24":(256,"YMM"), "YMM25":(256,"YMM"), "YMM26":(256,"YMM"), "YMM27":(256,"YMM"), "YMM28":(256,"YMM"), "YMM29":(256,"YMM"), "YMM30":(256,"YMM"), "YMM31":(256,"YMM"), +#ZMM Registers + "ZMM0":(512,"ZMM"),"ZMM1":(512,"ZMM"),"ZMM2":(512,"ZMM"),"ZMM3":(512,"ZMM"),"ZMM4":(512,"ZMM"),"ZMM5":(512,"ZMM"),"ZMM6":(512,"ZMM"),"ZMM7":(512,"ZMM"), "ZMM8":(512,"ZMM"), "ZMM9":(512,"ZMM"), "ZMM10":(512,"ZMM"), "ZMM11":(512,"ZMM"), "ZMM12":(512,"ZMM"), "ZMM13":(512,"ZMM"), "ZMM14":(512,"ZMM"), "ZMM15":(512,"ZMM"), "ZMM16":(512,"ZMM"), "ZMM17":(512,"ZMM"), "ZMM18":(512,"ZMM"), "ZMM19":(512,"ZMM"), "ZMM20":(512,"ZMM"), "ZMM21":(512,"ZMM"), "ZMM22":(512,"ZMM"), "ZMM23":(512,"ZMM"), "ZMM24":(512,"ZMM"), "ZMM25":(512,"ZMM"), "ZMM26":(512,"ZMM"), "ZMM27":(512,"ZMM"), "ZMM28":(512,"ZMM"), "ZMM29":(512,"ZMM"), "ZMM30":(512,"ZMM"), "ZMM31":(512,"ZMM"), +#Opmask Register + "K0":(64,"K"), "K1":(64,"K"), "K2":(64,"K"), "K3":(64,"K"), "K4":(64,"K"), "K5":(64,"K"), "K6":(64,"K"), "K7":(64,"K"), +#Bounds Registers + 
"BND0":(128,"BND"),"BND1":(128,"BND"),"BND2":(128,"BND"),"BND3":(128,"BND") + } + + def __init__(self,name,mask=False): + self.name = name.upper() + self.mask = mask +# try: + if[name in self.sizes]: + self.size = self.sizes[self.name][0] + self.reg_type = self.sizes[self.name][1] + else: + print(lncnt) + raise NameError("Register name not in dictionary: "+self.name) +# except KeyError: +# print(lncnt) + + def print(self): + opmask = "" + if(self.mask): + opmask = "{opmask}" + return(self.reg_type+str(self.size)+opmask) + + + +if __name__ == "__main__": +# load_db() + r0 = Register("ymm0") + r1 = Register("xmm0") + r2 = Register("rax") +# create_testcase("VMOVQ", [r1,r2]) +# create_testcase("VADDPD", [r0, r0, r0]) + if(len(sys.argv) > 1): + for i in range(1,len(sys.argv)): + extract_instr(sys.argv[i]) + print_sorted_db() + +# save_db() diff --git a/osaca.py b/osaca.py new file mode 100755 index 0000000..dedf3f9 --- /dev/null +++ b/osaca.py @@ -0,0 +1,432 @@ +#!/apps/python/3.5-anaconda/bin/python + +import argparse +import sys +import subprocess +import os +import re +import Params +import pandas as pd +from datetime import datetime + + +#----------Global variables-------------- +arch = '' +archList = ['SNB','IVB','HSW', 'BDW', 'SKL'] +filepath = '' +srcCode = '' +marker = r'//STARTLOOP' +asm_line = re.compile(r'\s[0-9a-f]+[:]') +numSeps = 0 +sem = 0 +firstAppearance = True +lncnt = 0 +instrForms = list() +df = '' +output = '' +horizontalSeparator = '' +total_tp = 0 +longestInstr = 30 +cycList = [] +reciList = [] +#--------------------------------------- + +# Check if the architecture arg is valid +def check_arch(): + if(arch in archList): + return True + else: + return False + +# Check if the given filepath exists and if the format is the needed elf64 +def check_elffile(): + if(os.path.isfile(filepath)): + create_elffile() + if('file format elf64' in srcCode[1]): + return True + return False + +# Check if the given filepath exists +def check_file(): + 
# Load arbitrary file in variable srcCode and separate by line
def get_file():
    """Read the file named by the global ``filepath`` into ``srcCode``.

    On success ``srcCode`` becomes the list of lines of the file (split at
    newline characters). If the file cannot be opened, a message is printed
    and ``srcCode`` is left untouched.
    """
    global srcCode
    try:
        f = open(filepath, 'r')
    except IOError:
        print('IOError: file \'{}\' not found'.format(filepath))
        # Without this return the code below used an unbound `f`.
        return
    with f:
        content = f.read()
    # Assign instead of appending to the previous value: srcCode is a list
    # after the first successful call, and `list += str` would extend it
    # character by character.
    srcCode = content.split('\n')
# Check if separators are either tabulators or whitespaces
def set_char_counter(line):
    """Decide which character indents the marker line and store it in ``cntChar``.

    Only the text in front of the first ``marker`` match is inspected.  The
    global ``cntChar`` is set to ``' '`` when that prefix contains spaces but
    no tabs, and to ``'\\t'`` when it contains tabs but no spaces.

    Raises:
        NotImplementedError: If the prefix contains both spaces and tabs, or
            neither of them.
    """
    global cntChar
    prefix = re.split(marker, line)[0]
    uses_spaces = ' ' in prefix
    uses_tabs = '\t' in prefix
    # Exactly one of the two kinds must be present.
    if uses_spaces == uses_tabs:
        raise NotImplementedError('Indentation of code is only supported for whitespaces and tabs.')
    cntChar = ' ' if uses_spaces else '\t'
def _lookup_throughput(mnemonic, operands):
    """Return the stored clock cycles for one instruction form, or None.

    The database key is ``<mnemonic>-<operands>-TP`` and is compared for
    exact equality against the ``instr`` column of the global ``df``.  The
    previous pre-check used ``Series.str.contains``, which interprets the key
    as a regular expression and also accepts substrings; operand tags such
    as ``mem(offset(base))`` contain regex metacharacters, so that check
    could both miss and falsely report entries.
    """
    key = mnemonic + '-' + operands + '-TP'
    hits = df[df.instr == key].clock_cycles.values
    if len(hits) == 0:
        return None
    return hits[0]


def _register_only_operands(op_ext, is_reg):
    """Return a copy of ``op_ext`` with non-register operands substituted.

    ``is_reg[k]`` tells whether operand ``k`` is a register.  The rules are
    the ones of the original inline code; at least one operand must be a
    register, so ``op_ext`` is never empty here.
    """
    subst = list(op_ext)
    # Destination is not a register (result is stored to memory): borrow the
    # second operand, or the third one if the second is no register either.
    if not is_reg[0] and len(subst) > 1:
        if is_reg[1]:
            subst[0] = subst[1]
        elif len(subst) > 2 and is_reg[2]:
            subst[0] = subst[2]
    # Two operands, the second one is not a register (load from memory).
    if len(is_reg) == 2 and not is_reg[1] and is_reg[0]:
        subst[1] = subst[0]
    # Three operands, the third one is not a register.
    if len(is_reg) == 3 and not is_reg[2]:
        subst[2] = subst[0]
    return subst


def create_output():
    """Build the throughput report and store it in the global ``output``.

    Reads the globals ``instrForms``, ``df``, ``arch``, ``filepath`` and
    ``horizontalSeparator``; clamps ``longestInstr`` to 70, adds every found
    throughput value to ``total_tp`` and overwrites ``output``.

    For each instruction form the exact form is looked up first.  If it is
    missing, the form with all operands replaced by registers is tried.
    Forms without any register operand print ``Feature not included yet``
    and are left out of the table.  Forms that are still not found are
    listed with 0 cycles and a ``*`` marker, and a hint is appended to the
    report.

    Changes compared to the previous version:
    * the two syntax errors in the fallback branch are fixed;
    * register operands are detected with ``isinstance`` on the stored
      operand objects.  The old code built ``Register(<type tag>)`` from an
      unimported name and from tags such as ``r64`` that are not register
      names;
    * an absolute ``filepath`` is no longer prefixed with the working
      directory;
    * a substring-only database hit no longer drops the instruction with
      ``Error while fetching data from database``; it now falls through to
      the register fallback.
    """
    global total_tp
    global output
    global longestInstr
    warning = False

    # Check the output alignment depending on the longest instruction
    if longestInstr > 70:
        longestInstr = 70
    create_horiz_sep()
    ws = ' ' * (len(horizontalSeparator) - 23)
    # Write general information about the benchmark
    header = [
        '--' + horizontalSeparator,
        '| Analyzing of file:\t' + os.path.join(os.getcwd(), filepath),
        '| Architecture:\t\t' + arch,
        '| Timestamp:\t\t' + datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        '|',
        '| INSTRUCTION' + ws + 'CLOCK CYCLES',
        '| ' + horizontalSeparator,
        '|',
        '',
    ]
    parts = ['\n'.join(header)]

    for elem in instrForms:
        # elem is [mnemonic, operand objects..., instruction text]
        mnemonic = elem[0]
        operand_objs = elem[1:-1]
        instr_text = elem[-1]

        # General purpose registers are keyed by size only ("r64"); every
        # other operand is keyed by its lower-cased print() form.
        op_ext = []
        for op in operand_objs:
            if isinstance(op, Params.Register) and op.reg_type == 'GPR':
                op_ext.append('r' + str(op.size))
            else:
                op_ext.append(op.print().lower())

        not_found = False
        tp = _lookup_throughput(mnemonic, '_'.join(op_ext))
        if tp is None:
            # Exact form unknown: try the register-only equivalent.
            is_reg = [isinstance(op, Params.Register) for op in operand_objs]
            if not any(is_reg):
                # No register in the whole form, so no register size to use.
                print('Feature not included yet')
                warning = True
                continue
            fallback = _register_only_operands(op_ext, is_reg)
            tp = _lookup_throughput(mnemonic, '_'.join(fallback))
            if tp is None:
                # Go on with throughput 0 and flag the line.
                tp = 0
                not_found = True
                warning = True

        # Add it to the overall throughput
        total_tp += tp
        # Pad the instruction text to the common column width
        ws = ' ' * (longestInstr - len(instr_text)) + '| '
        n_f = ''
        if not_found:
            n_f = ' ' * (5 - len(str(tp))) + '*'
        parts.append('| ' + instr_text + ws + str(tp) + n_f + '\n')

    # Finally write the total throughput
    ws = ' ' + ' ' * (longestInstr - 27)
    parts.append('| ' + horizontalSeparator + '\n'
                 '| TOTAL ESTIMATED THROUGHPUT:' + ws + str(total_tp))
    if warning:
        parts.append('\n\n* There was no throughput value found '
                     'for the specific instruction form.'
                     '\n Please create a testcase via the create_testcase-method '
                     'or add a value manually.')
    output = ''.join(parts)
Get instruction and the number of clock cycles + instr = line.split()[0][:-1] + clkC = line.split()[1] + clkC = validate_TP(clkC, instr) + tp = -1 + new = False + try: + tp = df.loc[lambda df: df.instr == instr,'clock_cycles'].values[0] + except IndexError: +# Instruction not in database yet --> add it + newData.append([instr,clkC]) + new = True + pass + if(not new and tp != clkC): + print('Different measurement for {}: {}(old) vs. {}(new)\nPlease check for correctness (no changes were made).'.format(instr, tp, clkC)) +# Now merge the DataFrames and write new csv file + df = df.append(pd.DataFrame(newData, columns=['instr','clock_cycles']), ignore_index=True) + csv = df.to_csv(index=False) + write_csv(csv) + print('ibench output {} successfully in database included.'.format(filepath.split('/')[-1])) + + +# main function of the tool +def inspect_binary(): +# Check args and exit program if something's wrong + if(not check_arch()): + print('Invalid microarchitecture.') + sys.exit() + if(not check_elffile()): + print('Invalid file path or file format.') + sys.exit() +# Finally check for database for the chosen architecture + read_csv() + + print('Everything seems fine! 
Let\'s start checking!') + for line in srcCode: + lncnt += 1 + check_line(line) + create_output() + print(output) + +##------------------------------------------------------------------------------ +##------------Main method-------------- +def main(): + global lncnt + global inp + global arch + global filepath +# Parse args + parser = argparse.ArgumentParser(description='Analyzes a marked innermost loop snippet for a given architecture type and prints out the estimated average throughput') + parser.add_argument('--version', '-V', action='version', version='%(prog)s 0.1') + parser.add_argument('--arch', dest='arch', type=str, help='define architecture') + parser.add_argument('filepath', type=str, help='path to object (Binary, CSV)') + parser.add_argument('--include-ibench', '-i', dest='incl', action='store_true', help='includes the given values in form of the output of ibench in the database') + +# Store args in global variables + inp = parser.parse_args() + arch = inp.arch.upper() + filepath = inp.filepath + inclIbench = inp.incl + + if(inclIbench): + include_ibench() + else: + inspect_binary() + + +##------------Main method-------------- +if __name__ == '__main__': + main() diff --git a/testcases/add-rr-TP.S b/testcases/add-rr-TP.S new file mode 100644 index 0000000..4f40830 --- /dev/null +++ b/testcases/add-rr-TP.S @@ -0,0 +1,100 @@ +#define INSTR add +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + 
xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR edx, eax + INSTR r9d, ebx + INSTR r10d, ecx + INSTR r11d, eax + INSTR r12d, ebx + INSTR r13d, ecx + INSTR r14d, eax + INSTR r15d, ebx + INSTR eax, ecx + INSTR ebx, eax + INSTR ecx, ebx + INSTR edx, ecx + INSTR r9d, eax + INSTR r10d, ebx + INSTR r11d, ecx + INSTR r12d, eax + INSTR r13d, ebx + INSTR r14d, ecx + INSTR r15d, eax + INSTR eax, ebx + INSTR ebx, ecx + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/add-rr.S b/testcases/add-rr.S new file mode 100644 index 0000000..1dc4adf --- /dev/null +++ b/testcases/add-rr.S @@ -0,0 +1,100 @@ +#define INSTR add +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq 
rax, xmm0 +loop: + inc i + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + INSTR eax, ebx + INSTR ebx, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/cmp-rr-TP.S b/testcases/cmp-rr-TP.S new file mode 100644 index 0000000..d2b943a --- /dev/null +++ b/testcases/cmp-rr-TP.S @@ -0,0 +1,100 @@ +#define INSTR cmp +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + INSTR r14, rax + INSTR r15, rbx + INSTR rax, rcx + INSTR rbx, rax + INSTR rcx, rbx + INSTR rdx, 
rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + INSTR rax, rbx + INSTR rbx, rcx + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/cmp-rr.S b/testcases/cmp-rr.S new file mode 100644 index 0000000..7e5ee2c --- /dev/null +++ b/testcases/cmp-rr.S @@ -0,0 +1,100 @@ +#define INSTR cmp +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop 
# Benchmark kernel for the dec instruction with one register operand.
# The loop body applies INSTR to NINST operands per iteration; NINST is also
# exported to the caller through the ninst data word.
# NOTE(review): the -TP suffix of the file name presumably marks the
# throughput variant although the entry point is still called latency -- confirm.
#define INSTR dec
#define NINST 24
# N is the iteration count (read from edi), i is the loop counter (r8d).
#define N edi
#define i r8d


.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
        push      rbp
        mov       rbp, rsp
        xor       i, i
        # Nothing to do for N <= 0; done is placed after the pops, so the
        # registers that have not been pushed yet are not popped either.
        test      N, N
        jle       done
        # create DP 1.0
        vpcmpeqw xmm0, xmm0, xmm0       # all ones
        vpsllq xmm0, xmm0, 54           # logical left shift: 11111110..0 (54=64-(10-1))
        vpsrlq xmm0, xmm0, 2            # logical right shift: 1 bit for sign; leading mantissa bit is zero
        # Save the registers that are zeroed and overwritten below.
        push rax
        push rbx
        push rcx
        push rdx
        push r9
        push r10
        push r11
        push r12
        push r13
        push r14
        push r15
        xor rax, rax
        xor rbx, rbx
        xor rcx, rcx
        xor rdx, rdx
        xor r9, r9
        xor r10, r10
        xor r11, r11
        xor r12, r12
        xor r13, r13
        xor r14, r14
        xor r15, r15
        # copy DP 1.0
        vmovq rax, xmm0
        vmovq rbx, xmm0
        # Create DP 2.0
        # NOTE(review): this is an integer add of two copies of the 1.0 bit
        # pattern; the sum is presumably not the encoding of 2.0 -- confirm.
        add rbx, rax
        # Create DP 0.5
        # NOTE(review): div is an integer divide of rdx:rax by rax; with rdx
        # zeroed above the quotient should be 1, not a 0.5 bit pattern -- confirm.
        div rax
        movq rcx, rax
        vmovq rax, xmm0
loop:
        inc      i
        # NINST copies of INSTR, rotating over several destination registers.
        INSTR    edx
        INSTR    r9d
        INSTR    r10d
        INSTR    edx
        INSTR    r9d
        INSTR    r10d
        INSTR    r11d
        INSTR    r12d
        INSTR    r13d
        INSTR    r14d
        INSTR    r15d
        INSTR    eax
        INSTR    ebx
        INSTR    ecx
        INSTR    edx
        INSTR    r9d
        INSTR    r10d
        INSTR    r11d
        INSTR    r12d
        INSTR    r13d
        INSTR    r14d
        INSTR    r15d
        INSTR    eax
        INSTR    ebx
        # Repeat while i < N.
        cmp      i, N
        jl       loop
        # Restore the saved registers in reverse push order.
        pop r15
        pop r14
        pop r13
        pop r12
        pop r11
        pop r10
        pop r9
        pop rdx
        pop rcx
        pop rbx
        pop rax
done:
        mov  rsp, rbp
        pop rbp
        ret
.size latency, .-latency
+.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/inc-r-TP.S b/testcases/inc-r-TP.S new file mode 100644 index 0000000..8c57e5e --- /dev/null +++ b/testcases/inc-r-TP.S @@ -0,0 +1,100 @@ +#define INSTR inc +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + 
push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rdx + INSTR r9 + INSTR r10 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rax + INSTR rbx + INSTR rcx + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rax + INSTR rbx + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/inc-r.S b/testcases/inc-r.S new file mode 100644 index 0000000..4f918c7 --- /dev/null +++ b/testcases/inc-r.S @@ -0,0 +1,100 @@ +#define INSTR inc +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR 
# Benchmark kernel applying INSTR to a 64-bit destination and a 32-bit source
# register.  The loop body holds NINST copies of INSTR; NINST is also exported
# to the caller through the ninst data word.
# NOTE(review): janadd does not look like an x86 mnemonic; presumably a
# placeholder left over from testcase generation, so this file is not
# expected to assemble as is -- confirm.
#define INSTR janadd
#define NINST 6
# N is the iteration count (read from edi), i is the loop counter (r8d).
#define N edi
#define i r8d


.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
        push      rbp
        mov       rbp, rsp
        xor       i, i
        # Nothing to do for N <= 0; done is placed after the pops, so the
        # registers that have not been pushed yet are not popped either.
        test      N, N
        jle       done
        # create DP 1.0
        vpcmpeqw xmm0, xmm0, xmm0       # all ones
        vpsllq xmm0, xmm0, 54           # logical left shift: 11111110..0 (54=64-(10-1))
        vpsrlq xmm0, xmm0, 2            # logical right shift: 1 bit for sign; leading mantissa bit is zero
        # Save the registers that are zeroed and overwritten below.
        push rax
        push rbx
        push rcx
        push rdx
        push r9
        push r10
        push r11
        push r12
        push r13
        push r14
        push r15
        xor rax, rax
        xor rbx, rbx
        xor rcx, rcx
        xor rdx, rdx
        xor r9, r9
        xor r10, r10
        xor r11, r11
        xor r12, r12
        xor r13, r13
        xor r14, r14
        xor r15, r15
        # copy DP 1.0
        vmovq rax, xmm0
        vmovq rbx, xmm0
        # Create DP 2.0
        # NOTE(review): this is an integer add of two copies of the 1.0 bit
        # pattern; the sum is presumably not the encoding of 2.0 -- confirm.
        add rbx, rax
        # Create DP 0.5
        # NOTE(review): div is an integer divide of rdx:rax by rax; with rdx
        # zeroed above the quotient should be 1, not a 0.5 bit pattern -- confirm.
        div rax
        movq rcx, rax
        vmovq rax, xmm0
loop:
        inc      i
        # NINST copies of INSTR, rotating over three destination registers.
        INSTR    rdx, eax
        INSTR    r9, ebx
        INSTR    r10, ecx
        INSTR    rdx, eax
        INSTR    r9, ebx
        INSTR    r10, ecx
        # Repeat while i < N.
        cmp      i, N
        jl       loop
        # Restore the saved registers in reverse push order.
        pop r15
        pop r14
        pop r13
        pop r12
        pop r11
        pop r10
        pop r9
        pop rdx
        pop rcx
        pop rbx
        pop rax
done:
        mov  rsp, rbp
        pop rbp
        ret
.size latency, .-latency
b/testcases/janadd-r64r32.S new file mode 100644 index 0000000..5568030 --- /dev/null +++ b/testcases/janadd-r64r32.S @@ -0,0 +1,82 @@ +#define INSTR janadd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/janadd-rr-TP.S b/testcases/janadd-rr-TP.S new file mode 100644 index 0000000..f6fd008 --- /dev/null +++ b/testcases/janadd-rr-TP.S @@ -0,0 +1,82 @@ +#define INSTR janadd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical 
right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rdx, eax + INSTR r9, ebx + INSTR r10, ecx + INSTR rdx, eax + INSTR r9, ebx + INSTR r10, ecx + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/janadd-rr.S b/testcases/janadd-rr.S new file mode 100644 index 0000000..5568030 --- /dev/null +++ b/testcases/janadd-rr.S @@ -0,0 +1,82 @@ +#define INSTR janadd +#define NINST 6 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR 
# Benchmark kernel for the mov instruction with two 64-bit register operands.
# The loop body holds NINST copies of INSTR; NINST is also exported to the
# caller through the ninst data word.
# NOTE(review): the -TP suffix of the file name presumably marks the
# throughput variant although the entry point is still called latency -- confirm.
#define INSTR mov
#define NINST 24
# N is the iteration count (read from edi), i is the loop counter (r8d).
#define N edi
#define i r8d


.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
        push      rbp
        mov       rbp, rsp
        xor       i, i
        # Nothing to do for N <= 0; done is placed after the pops, so the
        # registers that have not been pushed yet are not popped either.
        test      N, N
        jle       done
        # create DP 1.0
        vpcmpeqw xmm0, xmm0, xmm0       # all ones
        vpsllq xmm0, xmm0, 54           # logical left shift: 11111110..0 (54=64-(10-1))
        vpsrlq xmm0, xmm0, 2            # logical right shift: 1 bit for sign; leading mantissa bit is zero
        # Save the registers that are zeroed and overwritten below.
        push rax
        push rbx
        push rcx
        push rdx
        push r9
        push r10
        push r11
        push r12
        push r13
        push r14
        push r15
        xor rax, rax
        xor rbx, rbx
        xor rcx, rcx
        xor rdx, rdx
        xor r9, r9
        xor r10, r10
        xor r11, r11
        xor r12, r12
        xor r13, r13
        xor r14, r14
        xor r15, r15
        # copy DP 1.0
        vmovq rax, xmm0
        vmovq rbx, xmm0
        # Create DP 2.0
        # NOTE(review): this is an integer add of two copies of the 1.0 bit
        # pattern; the sum is presumably not the encoding of 2.0 -- confirm.
        add rbx, rax
        # Create DP 0.5
        # NOTE(review): div is an integer divide of rdx:rax by rax; with rdx
        # zeroed above the quotient should be 1, not a 0.5 bit pattern -- confirm.
        div rax
        movq rcx, rax
        vmovq rax, xmm0
loop:
        inc      i
        # NINST copies of INSTR, rotating over several destination registers.
        INSTR    rdx, rax
        INSTR    r9, rbx
        INSTR    r10, rcx
        INSTR    rdx, rax
        INSTR    r9, rbx
        INSTR    r10, rcx
        INSTR    r11, rax
        INSTR    r12, rbx
        INSTR    r13, rcx
        INSTR    r14, rax
        INSTR    r15, rbx
        INSTR    rax, rcx
        INSTR    rbx, rax
        INSTR    rcx, rbx
        INSTR    rdx, rcx
        INSTR    r9, rax
        INSTR    r10, rbx
        INSTR    r11, rcx
        INSTR    r12, rax
        INSTR    r13, rbx
        INSTR    r14, rcx
        INSTR    r15, rax
        INSTR    rax, rbx
        INSTR    rbx, rcx
        # Repeat while i < N.
        cmp      i, N
        jl       loop
        # Restore the saved registers in reverse push order.
        pop r15
        pop r14
        pop r13
        pop r12
        pop r11
        pop r10
        pop r9
        pop rdx
        pop rcx
        pop rbx
        pop rax
done:
        mov  rsp, rbp
        pop rbp
        ret
.size latency, .-latency
a/testcases/mov-rr.S b/testcases/mov-rr.S new file mode 100644 index 0000000..b15c313 --- /dev/null +++ b/testcases/mov-rr.S @@ -0,0 +1,100 @@ +#define INSTR mov +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/movslq-rr-TP.S b/testcases/movslq-rr-TP.S new file mode 100644 index 0000000..5ee7352 --- /dev/null +++ b/testcases/movslq-rr-TP.S @@ -0,0 +1,100 @@ +#define INSTR movslq +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data 
+ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rdx, eax + INSTR r9, ebx + INSTR r10, ecx + INSTR rdx, eax + INSTR r9, ebx + INSTR r10, ecx + INSTR r11, eax + INSTR r12, ebx + INSTR r13, ecx + INSTR r14, eax + INSTR r15, ebx + INSTR rax, ecx + INSTR rbx, eax + INSTR rcx, ebx + INSTR rdx, ecx + INSTR r9, eax + INSTR r10, ebx + INSTR r11, ecx + INSTR r12, eax + INSTR r13, ebx + INSTR r14, ecx + INSTR r15, eax + INSTR rax, ebx + INSTR rbx, ecx + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/movslq-rr.S b/testcases/movslq-rr.S new file mode 100644 index 0000000..b7f3825 --- /dev/null +++ b/testcases/movslq-rr.S @@ -0,0 +1,100 @@ +#define INSTR movslq +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 
(54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + INSTR rax, eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/movzbl-rr-TP.S b/testcases/movzbl-rr-TP.S new file mode 100644 index 0000000..f883521 --- /dev/null +++ b/testcases/movzbl-rr-TP.S @@ -0,0 +1,100 @@ +#define INSTR movzbl +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor 
rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx, al + INSTR r9d, bl + INSTR r10d, cl + INSTR edx, al + INSTR r9d, bl + INSTR r10d, cl + INSTR r11d, al + INSTR r12d, bl + INSTR r13d, cl + INSTR r14d, al + INSTR r15d, bl + INSTR eax, cl + INSTR ebx, al + INSTR ecx, bl + INSTR edx, cl + INSTR r9d, al + INSTR r10d, bl + INSTR r11d, cl + INSTR r12d, al + INSTR r13d, bl + INSTR r14d, cl + INSTR r15d, al + INSTR eax, bl + INSTR ebx, cl + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/movzbl-rr.S b/testcases/movzbl-rr.S new file mode 100644 index 0000000..0028005 --- /dev/null +++ b/testcases/movzbl-rr.S @@ -0,0 +1,100 @@ +#define INSTR movzbl +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: 
+ inc i + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + INSTR eax, al + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/neg-r-TP.S b/testcases/neg-r-TP.S new file mode 100644 index 0000000..b93faeb --- /dev/null +++ b/testcases/neg-r-TP.S @@ -0,0 +1,100 @@ +#define INSTR neg +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR edx + INSTR r9d + INSTR r10d + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR eax + INSTR ebx + INSTR ecx + INSTR edx + INSTR r9d + INSTR r10d + INSTR r11d + INSTR r12d + INSTR r13d + INSTR r14d + INSTR r15d + INSTR eax + 
INSTR ebx + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/neg-r.S b/testcases/neg-r.S new file mode 100644 index 0000000..88c7f6d --- /dev/null +++ b/testcases/neg-r.S @@ -0,0 +1,100 @@ +#define INSTR neg +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + INSTR eax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/pop-r-TP.S b/testcases/pop-r-TP.S new file mode 100644 index 0000000..9f676b6 --- /dev/null +++ b/testcases/pop-r-TP.S @@ -0,0 +1,100 @@ +#define 
INSTR pop +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rdx + INSTR r9 + INSTR r10 + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rax + INSTR rbx + INSTR rcx + INSTR rdx + INSTR r9 + INSTR r10 + INSTR r11 + INSTR r12 + INSTR r13 + INSTR r14 + INSTR r15 + INSTR rax + INSTR rbx + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/pop-r.S b/testcases/pop-r.S new file mode 100644 index 0000000..73fcb9a --- /dev/null +++ b/testcases/pop-r.S @@ -0,0 +1,100 @@ +#define INSTR pop +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 
2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + INSTR rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/sub-rr-TP.S b/testcases/sub-rr-TP.S new file mode 100644 index 0000000..28fbfc6 --- /dev/null +++ b/testcases/sub-rr-TP.S @@ -0,0 +1,100 @@ +#define INSTR sub +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + 
vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + INSTR r14, rax + INSTR r15, rbx + INSTR rax, rcx + INSTR rbx, rax + INSTR rcx, rbx + INSTR rdx, rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + INSTR rax, rbx + INSTR rbx, rcx + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/sub-rr.S b/testcases/sub-rr.S new file mode 100644 index 0000000..0eb2c63 --- /dev/null +++ b/testcases/sub-rr.S @@ -0,0 +1,100 @@ +#define INSTR sub +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + 
INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/test-rr-TP.S b/testcases/test-rr-TP.S new file mode 100644 index 0000000..0a7515d --- /dev/null +++ b/testcases/test-rr-TP.S @@ -0,0 +1,100 @@ +#define INSTR test +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR rdx, rax + INSTR r9, rbx + INSTR r10, rcx + INSTR r11, rax + INSTR r12, rbx + INSTR r13, rcx + INSTR r14, rax + INSTR r15, rbx + INSTR rax, rcx + INSTR rbx, rax + INSTR rcx, rbx + INSTR rdx, rcx + INSTR r9, rax + INSTR r10, rbx + INSTR r11, rcx + INSTR r12, rax + INSTR r13, rbx + INSTR r14, rcx + INSTR r15, rax + INSTR rax, rbx + INSTR rbx, rcx + cmp i, N + jl 
loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/test-rr.S b/testcases/test-rr.S new file mode 100644 index 0000000..4a1aa46 --- /dev/null +++ b/testcases/test-rr.S @@ -0,0 +1,100 @@ +#define INSTR test +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + INSTR rax, rbx + INSTR rbx, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddpd-avx-ymmymmymm-TP.S 
b/testcases/vaddpd-avx-ymmymmymm-TP.S new file mode 100644 index 0000000..88a5fdb --- /dev/null +++ b/testcases/vaddpd-avx-ymmymmymm-TP.S @@ -0,0 +1,67 @@ +#define INSTR vaddpd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm16, ymm1, ymm1 + INSTR ymm17, ymm2, ymm2 + INSTR ymm18, ymm0, ymm0 + INSTR ymm19, ymm1, ymm1 + INSTR ymm20, ymm2, ymm2 + INSTR ymm21, ymm0, ymm0 + INSTR ymm22, ymm1, ymm1 + INSTR ymm23, ymm2, ymm2 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddpd-avx-ymmymmymm.S b/testcases/vaddpd-avx-ymmymmymm.S new file mode 100644 index 0000000..d032dd2 --- /dev/null +++ b/testcases/vaddpd-avx-ymmymmymm.S @@ -0,0 +1,67 @@ +#define INSTR vaddpd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 
+latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddpd-xmmxmmxmm-TP.S b/testcases/vaddpd-xmmxmmxmm-TP.S new file mode 100644 index 0000000..45bee34 --- /dev/null +++ b/testcases/vaddpd-xmmxmmxmm-TP.S @@ -0,0 +1,65 @@ +#define INSTR vaddpd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd 
xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm16, xmm1, xmm1 + INSTR xmm17, xmm2, xmm2 + INSTR xmm18, xmm0, xmm0 + INSTR xmm19, xmm1, xmm1 + INSTR xmm20, xmm2, xmm2 + INSTR xmm21, xmm0, xmm0 + INSTR xmm22, xmm1, xmm1 + INSTR xmm23, xmm2, xmm2 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddpd-xmmxmmxmm.S b/testcases/vaddpd-xmmxmmxmm.S new file mode 100644 index 0000000..bea987d --- /dev/null +++ b/testcases/vaddpd-xmmxmmxmm.S @@ -0,0 +1,65 @@ +#define INSTR vaddpd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR 
xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddsd-xmmxmmxmm-TP.S b/testcases/vaddsd-xmmxmmxmm-TP.S new file mode 100644 index 0000000..3d04147 --- /dev/null +++ b/testcases/vaddsd-xmmxmmxmm-TP.S @@ -0,0 +1,65 @@ +#define INSTR vaddsd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm16, xmm1, xmm1 + INSTR xmm17, xmm2, xmm2 + INSTR xmm18, xmm0, xmm0 + INSTR xmm19, xmm1, xmm1 + INSTR xmm20, xmm2, xmm2 + INSTR xmm21, xmm0, xmm0 + INSTR xmm22, xmm1, xmm1 + INSTR xmm23, xmm2, xmm2 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vaddsd-xmmxmmxmm.S b/testcases/vaddsd-xmmxmmxmm.S 
new file mode 100644 index 0000000..2090c03 --- /dev/null +++ b/testcases/vaddsd-xmmxmmxmm.S @@ -0,0 +1,65 @@ +#define INSTR vaddsd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovapd-avx-ymmymm-TP.S b/testcases/vmovapd-avx-ymmymm-TP.S new file mode 100644 index 0000000..ff74ba3 --- /dev/null +++ b/testcases/vmovapd-avx-ymmymm-TP.S @@ -0,0 +1,67 @@ +#define INSTR vmovapd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all 
ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0 + INSTR ymm4, ymm1 + INSTR ymm5, ymm2 + INSTR ymm3, ymm0 + INSTR ymm4, ymm1 + INSTR ymm5, ymm2 + INSTR ymm6, ymm0 + INSTR ymm7, ymm1 + INSTR ymm8, ymm2 + INSTR ymm9, ymm0 + INSTR ymm10, ymm1 + INSTR ymm11, ymm2 + INSTR ymm12, ymm0 + INSTR ymm13, ymm1 + INSTR ymm14, ymm2 + INSTR ymm15, ymm0 + INSTR ymm16, ymm1 + INSTR ymm17, ymm2 + INSTR ymm18, ymm0 + INSTR ymm19, ymm1 + INSTR ymm20, ymm2 + INSTR ymm21, ymm0 + INSTR ymm22, ymm1 + INSTR ymm23, ymm2 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovapd-avx-ymmymm.S b/testcases/vmovapd-avx-ymmymm.S new file mode 100644 index 0000000..0396e83 --- /dev/null +++ b/testcases/vmovapd-avx-ymmymm.S @@ -0,0 +1,67 @@ +#define INSTR vmovapd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + 
INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + INSTR ymm0, ymm1 + INSTR ymm1, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovapd-xmmxmm-TP.S b/testcases/vmovapd-xmmxmm-TP.S new file mode 100644 index 0000000..acd24a8 --- /dev/null +++ b/testcases/vmovapd-xmmxmm-TP.S @@ -0,0 +1,65 @@ +#define INSTR vmovapd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0 + INSTR xmm4, xmm1 + INSTR xmm5, xmm2 + INSTR xmm3, xmm0 + INSTR xmm4, xmm1 + INSTR xmm5, xmm2 + INSTR xmm6, xmm0 + INSTR xmm7, xmm1 + INSTR xmm8, xmm2 + INSTR xmm9, xmm0 + INSTR xmm10, xmm1 + INSTR xmm11, xmm2 + INSTR xmm12, xmm0 + INSTR xmm13, xmm1 + INSTR xmm14, xmm2 + INSTR xmm15, xmm0 + INSTR xmm16, xmm1 + INSTR xmm17, xmm2 + INSTR xmm18, xmm0 + INSTR xmm19, xmm1 + INSTR xmm20, xmm2 + INSTR xmm21, xmm0 + INSTR xmm22, xmm1 + INSTR xmm23, xmm2 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovapd-xmmxmm.S b/testcases/vmovapd-xmmxmm.S new file mode 100644 index 0000000..89b66d6 --- 
/dev/null +++ b/testcases/vmovapd-xmmxmm.S @@ -0,0 +1,65 @@ +#define INSTR vmovapd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovaps-xmmxmm-TP.S b/testcases/vmovaps-xmmxmm-TP.S new file mode 100644 index 0000000..959363a --- /dev/null +++ b/testcases/vmovaps-xmmxmm-TP.S @@ -0,0 +1,65 @@ +#define INSTR vmovaps +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + 
vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0 + INSTR xmm4, xmm1 + INSTR xmm5, xmm2 + INSTR xmm3, xmm0 + INSTR xmm4, xmm1 + INSTR xmm5, xmm2 + INSTR xmm6, xmm0 + INSTR xmm7, xmm1 + INSTR xmm8, xmm2 + INSTR xmm9, xmm0 + INSTR xmm10, xmm1 + INSTR xmm11, xmm2 + INSTR xmm12, xmm0 + INSTR xmm13, xmm1 + INSTR xmm14, xmm2 + INSTR xmm15, xmm0 + INSTR xmm16, xmm1 + INSTR xmm17, xmm2 + INSTR xmm18, xmm0 + INSTR xmm19, xmm1 + INSTR xmm20, xmm2 + INSTR xmm21, xmm0 + INSTR xmm22, xmm1 + INSTR xmm23, xmm2 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovaps-xmmxmm.S b/testcases/vmovaps-xmmxmm.S new file mode 100644 index 0000000..9559f9f --- /dev/null +++ b/testcases/vmovaps-xmmxmm.S @@ -0,0 +1,65 @@ +#define INSTR vmovaps +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + INSTR xmm0, xmm1 + INSTR xmm1, xmm0 + cmp i, 
N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovq-rxmm-TP.S b/testcases/vmovq-rxmm-TP.S new file mode 100644 index 0000000..3b80f46 --- /dev/null +++ b/testcases/vmovq-rxmm-TP.S @@ -0,0 +1,98 @@ +#define INSTR vmovq +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR rdx, xmm0 + INSTR r9, xmm1 + INSTR r10, xmm2 + INSTR rdx, xmm0 + INSTR r9, xmm1 + INSTR r10, xmm2 + INSTR r11, xmm0 + INSTR r12, xmm1 + INSTR r13, xmm2 + INSTR r14, xmm0 + INSTR r15, xmm1 + INSTR rax, xmm2 + INSTR rbx, xmm0 + INSTR rcx, xmm1 + INSTR rdx, xmm2 + INSTR r9, xmm0 + INSTR r10, xmm1 + INSTR r11, xmm2 + INSTR r12, xmm0 + INSTR r13, xmm1 + INSTR r14, xmm2 + INSTR r15, xmm0 + INSTR rax, xmm1 + INSTR rbx, xmm2 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovq-rxmm.S b/testcases/vmovq-rxmm.S new file mode 100644 index 0000000..a1d5c05 --- /dev/null +++ 
b/testcases/vmovq-rxmm.S @@ -0,0 +1,98 @@ +#define INSTR vmovq +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + INSTR rax, xmm0 + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovq-xmmr-TP.S b/testcases/vmovq-xmmr-TP.S new file mode 100644 index 0000000..c84e892 --- /dev/null +++ b/testcases/vmovq-xmmr-TP.S @@ -0,0 +1,100 @@ +#define INSTR vmovq +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function 
+.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR xmm3, rax + INSTR xmm4, rbx + INSTR xmm5, rcx + INSTR xmm3, rax + INSTR xmm4, rbx + INSTR xmm5, rcx + INSTR xmm6, rax + INSTR xmm7, rbx + INSTR xmm8, rcx + INSTR xmm9, rax + INSTR xmm10, rbx + INSTR xmm11, rcx + INSTR xmm12, rax + INSTR xmm13, rbx + INSTR xmm14, rcx + INSTR xmm15, rax + INSTR xmm16, rbx + INSTR xmm17, rcx + INSTR xmm18, rax + INSTR xmm19, rbx + INSTR xmm20, rcx + INSTR xmm21, rax + INSTR xmm22, rbx + INSTR xmm23, rcx + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovq-xmmr.S b/testcases/vmovq-xmmr.S new file mode 100644 index 0000000..1bfd1ea --- /dev/null +++ b/testcases/vmovq-xmmr.S @@ -0,0 +1,100 @@ +#define INSTR vmovq +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, 
xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + INSTR xmm0, rax + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovsd-xmmxmmxmm-TP.S b/testcases/vmovsd-xmmxmmxmm-TP.S new file mode 100644 index 0000000..cad7071 --- /dev/null +++ b/testcases/vmovsd-xmmxmmxmm-TP.S @@ -0,0 +1,65 @@ +#define INSTR vmovsd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, 
xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm16, xmm1, xmm1 + INSTR xmm17, xmm2, xmm2 + INSTR xmm18, xmm0, xmm0 + INSTR xmm19, xmm1, xmm1 + INSTR xmm20, xmm2, xmm2 + INSTR xmm21, xmm0, xmm0 + INSTR xmm22, xmm1, xmm1 + INSTR xmm23, xmm2, xmm2 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmovsd-xmmxmmxmm.S b/testcases/vmovsd-xmmxmmxmm.S new file mode 100644 index 0000000..2bac0f2 --- /dev/null +++ b/testcases/vmovsd-xmmxmmxmm.S @@ -0,0 +1,65 @@ +#define INSTR vmovsd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, 
xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmulpd-avx-ymmymmymm-TP.S b/testcases/vmulpd-avx-ymmymmymm-TP.S new file mode 100644 index 0000000..0b3b1ad --- /dev/null +++ b/testcases/vmulpd-avx-ymmymmymm-TP.S @@ -0,0 +1,67 @@ +#define INSTR vmulpd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm16, ymm1, ymm1 + INSTR ymm17, ymm2, ymm2 + INSTR ymm18, ymm0, ymm0 + INSTR ymm19, ymm1, ymm1 + INSTR ymm20, ymm2, ymm2 + INSTR ymm21, ymm0, ymm0 + INSTR ymm22, ymm1, ymm1 + INSTR ymm23, ymm2, ymm2 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmulpd-avx-ymmymmymm.S 
b/testcases/vmulpd-avx-ymmymmymm.S new file mode 100644 index 0000000..00279d3 --- /dev/null +++ b/testcases/vmulpd-avx-ymmymmymm.S @@ -0,0 +1,67 @@ +#define INSTR vmulpd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmulsd-xmmxmmxmm-TP.S b/testcases/vmulsd-xmmxmmxmm-TP.S new file mode 100644 index 0000000..144dce2 --- /dev/null +++ b/testcases/vmulsd-xmmxmmxmm-TP.S @@ -0,0 +1,65 @@ +#define INSTR vmulsd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov 
rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm16, xmm1, xmm1 + INSTR xmm17, xmm2, xmm2 + INSTR xmm18, xmm0, xmm0 + INSTR xmm19, xmm1, xmm1 + INSTR xmm20, xmm2, xmm2 + INSTR xmm21, xmm0, xmm0 + INSTR xmm22, xmm1, xmm1 + INSTR xmm23, xmm2, xmm2 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vmulsd-xmmxmmxmm.S b/testcases/vmulsd-xmmxmmxmm.S new file mode 100644 index 0000000..191e7b4 --- /dev/null +++ b/testcases/vmulsd-xmmxmmxmm.S @@ -0,0 +1,65 @@ +#define INSTR vmulsd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR 
xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vsubpd-avx-ymmymmymm-TP.S b/testcases/vsubpd-avx-ymmymmymm-TP.S new file mode 100644 index 0000000..2ec8183 --- /dev/null +++ b/testcases/vsubpd-avx-ymmymmymm-TP.S @@ -0,0 +1,67 @@ +#define INSTR vsubpd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, 
ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm16, ymm1, ymm1 + INSTR ymm17, ymm2, ymm2 + INSTR ymm18, ymm0, ymm0 + INSTR ymm19, ymm1, ymm1 + INSTR ymm20, ymm2, ymm2 + INSTR ymm21, ymm0, ymm0 + INSTR ymm22, ymm1, ymm1 + INSTR ymm23, ymm2, ymm2 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vsubpd-avx-ymmymmymm.S b/testcases/vsubpd-avx-ymmymmymm.S new file mode 100644 index 0000000..4c803bd --- /dev/null +++ b/testcases/vsubpd-avx-ymmymmymm.S @@ -0,0 +1,67 @@ +#define INSTR vsubpd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git 
a/testcases/vsubsd-xmmxmmxmm-TP.S b/testcases/vsubsd-xmmxmmxmm-TP.S new file mode 100644 index 0000000..c14a8fb --- /dev/null +++ b/testcases/vsubsd-xmmxmmxmm-TP.S @@ -0,0 +1,65 @@ +#define INSTR vsubsd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm16, xmm1, xmm1 + INSTR xmm17, xmm2, xmm2 + INSTR xmm18, xmm0, xmm0 + INSTR xmm19, xmm1, xmm1 + INSTR xmm20, xmm2, xmm2 + INSTR xmm21, xmm0, xmm0 + INSTR xmm22, xmm1, xmm1 + INSTR xmm23, xmm2, xmm2 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vsubsd-xmmxmmxmm.S b/testcases/vsubsd-xmmxmmxmm.S new file mode 100644 index 0000000..e9dad4c --- /dev/null +++ b/testcases/vsubsd-xmmxmmxmm.S @@ -0,0 +1,65 @@ +#define INSTR vsubsd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, 
N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vunpckhpd-xmmxmmxmm-TP.S b/testcases/vunpckhpd-xmmxmmxmm-TP.S new file mode 100644 index 0000000..1f5cafe --- /dev/null +++ b/testcases/vunpckhpd-xmmxmmxmm-TP.S @@ -0,0 +1,65 @@ +#define INSTR vunpckhpd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR 
xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm16, xmm1, xmm1 + INSTR xmm17, xmm2, xmm2 + INSTR xmm18, xmm0, xmm0 + INSTR xmm19, xmm1, xmm1 + INSTR xmm20, xmm2, xmm2 + INSTR xmm21, xmm0, xmm0 + INSTR xmm22, xmm1, xmm1 + INSTR xmm23, xmm2, xmm2 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vunpckhpd-xmmxmmxmm.S b/testcases/vunpckhpd-xmmxmmxmm.S new file mode 100644 index 0000000..7b4a197 --- /dev/null +++ b/testcases/vunpckhpd-xmmxmmxmm.S @@ -0,0 +1,65 @@ +#define INSTR vunpckhpd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR 
xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vxorpd-avx-ymmymmymm-TP.S b/testcases/vxorpd-avx-ymmymmymm-TP.S new file mode 100644 index 0000000..9e7b830 --- /dev/null +++ b/testcases/vxorpd-avx-ymmymmymm-TP.S @@ -0,0 +1,67 @@ +#define INSTR vxorpd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm3, ymm0, ymm0 + INSTR ymm4, ymm1, ymm1 + INSTR ymm5, ymm2, ymm2 + INSTR ymm6, ymm0, ymm0 + INSTR ymm7, ymm1, ymm1 + INSTR ymm8, ymm2, ymm2 + INSTR ymm9, ymm0, ymm0 + INSTR ymm10, ymm1, ymm1 + INSTR ymm11, ymm2, ymm2 + INSTR ymm12, ymm0, ymm0 + INSTR ymm13, ymm1, ymm1 + INSTR ymm14, ymm2, ymm2 + INSTR ymm15, ymm0, ymm0 + INSTR ymm16, ymm1, ymm1 + INSTR ymm17, ymm2, ymm2 + INSTR ymm18, ymm0, ymm0 + INSTR ymm19, ymm1, ymm1 + INSTR ymm20, ymm2, ymm2 + INSTR ymm21, ymm0, ymm0 + INSTR ymm22, ymm1, ymm1 + INSTR ymm23, ymm2, ymm2 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vxorpd-avx-ymmymmymm.S b/testcases/vxorpd-avx-ymmymmymm.S new file mode 
100644 index 0000000..a1f370d --- /dev/null +++ b/testcases/vxorpd-avx-ymmymmymm.S @@ -0,0 +1,67 @@ +#define INSTR vxorpd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # expand from SSE to AVX + vinsertf128 ymm0, ymm0, xmm0, 0x1 + # copy DP 1.0 + vmovaps ymm0, ymm0 + vmovaps ymm1, ymm0 + # Create DP 2.0 + vaddpd ymm1, ymm1, ymm1 + # Create DP 0.5 + vdivpd ymm2, ymm0, ymm1 +loop: + inc i + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + INSTR ymm0, ymm1, ymm0 + INSTR ymm1, ymm0, ymm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vxorpd-xmmxmmxmm-TP.S b/testcases/vxorpd-xmmxmmxmm-TP.S new file mode 100644 index 0000000..bbacc19 --- /dev/null +++ b/testcases/vxorpd-xmmxmmxmm-TP.S @@ -0,0 +1,65 @@ +#define INSTR vxorpd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # 
create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm16, xmm1, xmm1 + INSTR xmm17, xmm2, xmm2 + INSTR xmm18, xmm0, xmm0 + INSTR xmm19, xmm1, xmm1 + INSTR xmm20, xmm2, xmm2 + INSTR xmm21, xmm0, xmm0 + INSTR xmm22, xmm1, xmm1 + INSTR xmm23, xmm2, xmm2 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vxorpd-xmmxmmxmm.S b/testcases/vxorpd-xmmxmmxmm.S new file mode 100644 index 0000000..8783f3c --- /dev/null +++ b/testcases/vxorpd-xmmxmmxmm.S @@ -0,0 +1,65 @@ +#define INSTR vxorpd +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR 
xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vxorps-xmmxmmxmm-TP.S b/testcases/vxorps-xmmxmmxmm-TP.S new file mode 100644 index 0000000..d8b097b --- /dev/null +++ b/testcases/vxorps-xmmxmmxmm-TP.S @@ -0,0 +1,65 @@ +#define INSTR vxorps +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm3, xmm0, xmm0 + INSTR xmm4, xmm1, xmm1 + INSTR xmm5, xmm2, xmm2 + INSTR xmm6, xmm0, xmm0 + INSTR xmm7, xmm1, xmm1 + INSTR xmm8, xmm2, xmm2 + INSTR xmm9, xmm0, xmm0 + INSTR xmm10, xmm1, xmm1 + INSTR xmm11, xmm2, xmm2 + INSTR xmm12, xmm0, xmm0 + INSTR xmm13, xmm1, xmm1 + INSTR xmm14, xmm2, xmm2 + INSTR xmm15, xmm0, xmm0 + INSTR xmm16, xmm1, xmm1 + INSTR xmm17, xmm2, xmm2 + INSTR xmm18, xmm0, xmm0 + INSTR xmm19, 
xmm1, xmm1 + INSTR xmm20, xmm2, xmm2 + INSTR xmm21, xmm0, xmm0 + INSTR xmm22, xmm1, xmm1 + INSTR xmm23, xmm2, xmm2 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/vxorps-xmmxmmxmm.S b/testcases/vxorps-xmmxmmxmm.S new file mode 100644 index 0000000..2309d0c --- /dev/null +++ b/testcases/vxorps-xmmxmmxmm.S @@ -0,0 +1,65 @@ +#define INSTR vxorps +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + # copy DP 1.0 + vmovaps xmm0, xmm0 + vmovaps xmm1, xmm0 + # Create DP 2.0 + vaddpd xmm1, xmm1, xmm1 + # Create DP 0.5 + vdivpd xmm2, xmm0, xmm1 +loop: + inc i + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + INSTR xmm0, xmm1, xmm0 + INSTR xmm1, xmm0, xmm0 + cmp i, N + jl loop +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/xor-rr-TP.S b/testcases/xor-rr-TP.S new file mode 100644 index 0000000..caf6b9d --- /dev/null +++ b/testcases/xor-rr-TP.S @@ -0,0 +1,100 @@ +#define INSTR xor +#define NINST 24 +#define N edi +#define 
i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR dl, al + INSTR r9l, bl + INSTR r10l, cl + INSTR dl, al + INSTR r9l, bl + INSTR r10l, cl + INSTR r11l, al + INSTR r12l, bl + INSTR r13l, cl + INSTR r14l, al + INSTR r15l, bl + INSTR al, cl + INSTR bl, al + INSTR cl, bl + INSTR dl, cl + INSTR r9l, al + INSTR r10l, bl + INSTR r11l, cl + INSTR r12l, al + INSTR r13l, bl + INSTR r14l, cl + INSTR r15l, al + INSTR al, bl + INSTR bl, cl + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file diff --git a/testcases/xor-rr.S b/testcases/xor-rr.S new file mode 100644 index 0000000..04f32e7 --- /dev/null +++ b/testcases/xor-rr.S @@ -0,0 +1,100 @@ +#define INSTR xor +#define NINST 24 +#define N edi +#define i r8d + + +.intel_syntax noprefix +.globl ninst +.data +ninst: +.long NINST +.text +.globl latency +.type latency, @function +.align 32 +latency: + push rbp + mov rbp, rsp + xor i, i + test N, N + jle done + # create DP 1.0 + vpcmpeqw xmm0, xmm0, xmm0 # all ones + vpsllq xmm0, xmm0, 54 # logical left 
shift: 11111110..0 (54=64-(10-1)) + vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero + push rax + push rbx + push rcx + push rdx + push r9 + push r10 + push r11 + push r12 + push r13 + push r14 + push r15 + xor rax, rax + xor rbx, rbx + xor rcx, rcx + xor rdx, rdx + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + # copy DP 1.0 + vmovq rax, xmm0 + vmovq rbx, xmm0 + # Create DP 2.0 + add rbx, rax + # Create DP 0.5 + div rax + movq rcx, rax + vmovq rax, xmm0 +loop: + inc i + INSTR al, bl + INSTR bl, al + INSTR al, bl + INSTR bl, al + INSTR al, bl + INSTR bl, al + INSTR al, bl + INSTR bl, al + INSTR al, bl + INSTR bl, al + INSTR al, bl + INSTR bl, al + INSTR al, bl + INSTR bl, al + INSTR al, bl + INSTR bl, al + INSTR al, bl + INSTR bl, al + INSTR al, bl + INSTR bl, al + INSTR al, bl + INSTR bl, al + INSTR al, bl + INSTR bl, al + cmp i, N + jl loop + pop r15 + pop r14 + pop r13 + pop r12 + pop r11 + pop r10 + pop r9 + pop rdx + pop rcx + pop rbx + pop rax +done: + mov rsp, rbp + pop rbp + ret +.size latency, .-latency \ No newline at end of file