initial upload

This commit is contained in:
Jan Laukemann
2017-07-17 15:29:56 +02:00
parent 183802b110
commit a1dc3b639b
67 changed files with 6265 additions and 0 deletions

103
Params.py Executable file
View File

@@ -0,0 +1,103 @@
#!/apps/python/3.5-anaconda/bin/python
class Parameter(object):
    """Generic instruction operand that is described only by its kind."""

    # Operand kinds accepted by the constructor (compared in upper case).
    type_list = ["REG", "MEM", "IMD", "LBL", "NONE"]

    def __init__(self, ptype, name=""):
        """Store the upper-cased operand kind.

        ``name`` is accepted for signature compatibility but is not used.

        Raises:
            NameError: if the upper-cased ``ptype`` is not in ``type_list``.
        """
        kind = ptype.upper()
        self.ptype = kind
        if kind not in self.type_list:
            raise NameError("Type not supported: " + ptype)

    def print(self):
        """Return the operand kind, or an empty string for kind ``NONE``."""
        return "" if self.ptype == "NONE" else self.ptype
class MemAddr(Parameter):
    """Memory operand classified by which address components it contains.

    The operand text is inspected for a ``:`` (segment register), for text in
    front of a ``(`` (offset) and for the comma count between the parentheses
    (base only, or base/index/scale).
    """

    segment_regs = ["CS", "DS", "SS", "ES", "FS", "GS"]
    scales = [1, 2, 4, 8]

    def __init__(self, name):
        """Parse the operand text ``name`` and set the component flags.

        Raises:
            NameError: if the text before ``:`` (minus its first character) is
                not a known segment register, or if the parenthesised part
                has neither zero commas nor two commas with a valid scale.
        """
        self.sreg = self.offset = self.base = self.index = self.scale = False

        colon = name.find(':')
        if colon != -1:
            # The first character is skipped (presumably the '%' sigil).
            if name[1:colon].upper() not in self.segment_regs:
                raise NameError("Type not supported: " + name)
            self.sreg = self.offset = True

        paren = name.find('(')
        # No parentheses at all, or something written in front of them.
        if paren != 0:
            self.offset = True
        if paren == -1:
            return

        # Everything between '(' and the final character of the operand.
        self.parentheses = name[paren + 1:-1]
        self.commacnt = self.parentheses.count(',')
        if self.commacnt == 0:
            self.base = True
        elif self.commacnt == 2 and int(self.parentheses[-1:]) in self.scales:
            self.base = self.index = self.scale = True
        else:
            raise NameError("Type not supported: " + name)

    def print(self):
        """Return the symbolic memory format, e.g. ``MEM(offset(base))``."""
        pieces = ["MEM("]
        if self.sreg:
            pieces.append("sreg:")
        if self.offset:
            pieces.append("offset")
        if self.base and not self.index:
            pieces.append("(base)")
        elif self.base and self.index and self.scale:
            pieces.append("(base, index, scale)")
        pieces.append(")")
        self.mem_format = "".join(pieces)
        return self.mem_format
def _build_register_sizes():
    """Return the lookup table ``register name -> (bit width, register class)``."""
    table = {}

    # Individually named registers, grouped by bit width. Segment registers,
    # flags and instruction pointers are filed under the "GPR" class.
    named_gprs = {
        8: ("AH", "AL", "BH", "BL", "CH", "CL", "DH", "DL",
            "BPL", "SIL", "DIL", "SPL"),
        16: ("AX", "BX", "CX", "DX", "BP", "SI", "DI", "SP",
             "CS", "DS", "SS", "ES", "FS", "GS"),
        32: ("EAX", "EBX", "ECX", "EDX", "EBP", "ESI", "EDI", "ESP",
             "EFLAGS", "EIP"),
        64: ("RAX", "RBX", "RCX", "RDX", "RBP", "RSI", "RDI", "RSP",
             "RFLAGS", "RIP"),
    }
    for width, names in named_gprs.items():
        for reg in names:
            table[reg] = (width, "GPR")

    # R8..R15 with their sub-register suffixes (both the L and B spelling
    # of the 8-bit form are accepted).
    for number in range(8, 16):
        for suffix, width in (("L", 8), ("B", 8), ("W", 16), ("D", 32), ("", 64)):
            table["R{}{}".format(number, suffix)] = (width, "GPR")

    # Numbered register files: (name prefix, register class, width, count).
    banks = (
        ("ST", "FPU", 80, 8),
        ("MM", "MMX", 64, 8),
        ("XMM", "XMM", 128, 32),
        ("YMM", "YMM", 256, 32),
        ("ZMM", "ZMM", 512, 32),
        ("K", "K", 64, 8),
        ("BND", "BND", 128, 4),
    )
    for prefix, reg_class, width, count in banks:
        for number in range(count):
            table[prefix + str(number)] = (width, reg_class)

    # Registers in general: class placeholders instead of concrete names.
    table.update({
        "GPR8": (8, "GPR"), "GPR16": (16, "GPR"),
        "GPR32": (32, "GPR"), "GPR64": (64, "GPR"),
        "FPU": (80, "FPU"), "MMX": (64, "MMX"),
        "XMM": (128, "XMM"), "YMM": (256, "YMM"), "ZMM": (512, "ZMM"),
        "K": (64, "K"), "BND": (128, "BND"),
    })
    return table


class Register(Parameter):
    """Register operand with its bit width and register class.

    Compared with the previous revision this version
    * restores the comma that was missing in front of the generic entries
      (the dictionary literal did not parse),
    * lists ``BX`` instead of the misspelt ``BC``,
    * really checks whether the name is known (``if[...]`` tested a
      non-empty list and was therefore always true) and
    * looks the register up by its upper-cased name.
    """

    # Register name (upper case) -> (size in bits, register class).
    sizes = _build_register_sizes()

    def __init__(self, name, mask=False):
        """Look up ``name`` case-insensitively.

        Args:
            name: register name without any sigil, e.g. ``"xmm0"``.
            mask: truthy if an opmask is attached to the operand.

        Raises:
            NameError: if the register is not contained in ``sizes``.
        """
        self.name = name.upper()
        self.mask = mask
        try:
            self.size, self.reg_type = self.sizes[self.name]
        except KeyError:
            # The old else-branch also printed `lncnt`, a name that is not
            # defined in this module; that call is dropped.
            raise NameError("Register name not in dictionary: " + self.name) from None

    def print(self):
        """Return the register class, followed by ``{opmask}`` if masked."""
        opmask = "{opmask}" if self.mask else ""
        return self.reg_type + opmask

255
Testcase.py Executable file
View File

@@ -0,0 +1,255 @@
#!/apps/python/3.5-anaconda/bin/python
import os
from subprocess import call
from math import ceil
from Params import Register
class Testcase(object):
    """Build latency and throughput benchmark assembly for one instruction form.

    The constructor assembles all text fragments; ``write_testcase`` writes
    them to ``./testcases/<instr>-<a>_<b>_<c>.S`` (latency) and the matching
    ``-TP.S`` file (throughput).

    Fixes compared with the previous revision: the MMX branch of
    ``__copy_regs`` referenced a bare ``ops`` (NameError), ``bnds`` listed
    eight bound registers although only BND0-BND3 exist, the output files
    are now closed reliably and the directory is created without spawning
    ``mkdir``.
    """

    ##------------------Constant variables--------------------------
    # Lookup tables for regs. r8 is absent from the GPR tables; the generated
    # header defines the loop counter `i` as r8d.
    gprs64 = ['rax', 'rbx', 'rcx', 'rdx', 'r9', 'r10', 'r11', 'r12', 'r13', 'r14', 'r15']
    gprs32 = ['eax', 'ebx', 'ecx', 'edx', 'r9d', 'r10d', 'r11d', 'r12d', 'r13d', 'r14d', 'r15d']
    gprs16 = ['ax', 'bx', 'cx', 'dx', 'r9w', 'r10w', 'r11w', 'r12w', 'r13w', 'r14w', 'r15w']
    gprs8 = ['al', 'bl', 'cl', 'dl', 'r9l', 'r10l', 'r11l', 'r12l', 'r13l', 'r14l', 'r15l']
    fpus = ['st{}'.format(n) for n in range(8)]
    mmxs = ['mm{}'.format(n) for n in range(8)]
    ks = ['k{}'.format(n) for n in range(8)]
    # Only four bound registers exist (BND0-BND3).
    bnds = ['bnd{}'.format(n) for n in range(4)]
    xmms = ['xmm{}'.format(n) for n in range(32)]
    ymms = ['ymm{}'.format(n) for n in range(32)]
    zmms = ['zmm{}'.format(n) for n in range(32)]
    # Operand key (as produced by __define_regs) -> list of register names.
    ops = {'gpr64': gprs64, 'gpr32': gprs32, 'gpr16': gprs16, 'gpr8': gprs8,
           'fpu': fpus, 'mmx': mmxs, 'k': ks, 'bnd': bnds,
           'xmm': xmms, 'ymm': ymms, 'zmm': zmms}
    # Create Single Precision 1.0 (not referenced by write_testcase)
    sp1 = ('\t\t# create SP 1.0\n'
           '\t\tvpcmpeqw xmm0, xmm0, xmm0\n'
           '\t\tvpslld xmm0, xmm0, 25\t\t\t# logical left shift: 11111110..0 (25=32-(8-1))\n'
           '\t\tvpsrld xmm0, xmm0, 2\t\t\t# logical right shift: 1 bit for sign; leading mantissa bit is zero\n'
           '\t\t# copy SP 1.0\n')
    # Create Double Precision 1.0
    dp1 = ('\t\t# create DP 1.0\n'
           '\t\tvpcmpeqw xmm0, xmm0, xmm0\t\t# all ones\n'
           '\t\tvpsllq xmm0, xmm0, 54\t\t\t# logical left shift: 11111110..0 (54=64-(10-1))\n'
           '\t\tvpsrlq xmm0, xmm0, 2\t\t\t# logical right shift: 1 bit for sign; leading mantissa bit is zero\n')
    # Create epilogue
    done = ('done:\n'
            '\t\tmov\trsp, rbp\n'
            '\t\tpop\trbp\n'
            '\t\tret\n'
            '.size latency, .-latency')
    ##----------------------------------------------------------------

    # Constructor
    def __init__(self, _mnemonic, _param_list, _num_instr='12'):
        """Prepare every text fragment of the benchmark.

        Args:
            _mnemonic: instruction mnemonic; stored lower-cased.
            _param_list: operands; each needs ``reg_type`` and ``size``
                attributes and the list must not be empty.
            _num_instr: number of instructions per loop body (string or
                int); rounded up to the next even number.
        """
        self.instr = _mnemonic.lower()
        self.param_list = _param_list
        # num_instr must be an even number
        self.num_instr = str(ceil(int(_num_instr) / 2) * 2)
        # Check for the number of operands and initialise the GPRs if necessary
        (self.reg_a, self.reg_b, self.reg_c, self.gprPush, self.gprPop,
         self.zeroGPR, self.copy) = self.__define_regs()
        self.num_regs = len(self.param_list)
        # Create asm header
        self.def_instr, self.ninstr, self.init, self.expand = self.__define_header()
        # Create latency and throughput loop
        self.loop_lat = self.__define_loop_lat()
        self.loop_thrpt = self.__define_loop_thrpt()

    def write_testcase(self):
        """Write the latency and the throughput file below ``./testcases``."""
        def label(reg):
            # 'gpr64' becomes 'r64'; all other keys are used unchanged.
            return reg if 'gpr' not in reg else 'r' + reg[3:]

        # Add operands
        extension = '-' + '_'.join(label(reg) for reg in (self.reg_a, self.reg_b, self.reg_c))
        os.makedirs('testcases', exist_ok=True)
        prologue = (self.def_instr + self.ninstr + self.init + self.dp1 + self.expand
                    + self.gprPush + self.zeroGPR + self.copy)
        epilogue = self.gprPop + self.done
        # Latency file first, throughput file second.
        for suffix, loop in (('', self.loop_lat), ('-TP', self.loop_thrpt)):
            path = './testcases/' + self.instr + extension + suffix + '.S'
            with open(path, 'w') as f:
                f.write(prologue + loop + epilogue)

    # Check register
    def __define_regs(self):
        """Derive the operand keys and the GPR save/restore/zero fragments.

        Returns:
            ``(reg_a, reg_b, reg_c, gprPush, gprPop, zeroGPR, copy)``; keys of
            absent operands and unused fragments are empty strings.
        """
        regs = self.param_list
        reg_b, reg_c = '', ''
        gprPush, gprPop, zeroGPR = '', '', ''
        reg_a = regs[0].reg_type.lower()
        if reg_a == 'gpr':
            gprPush, gprPop, zeroGPR = self.__initialise_gprs()
            reg_a += str(regs[0].size)
        if len(regs) > 1:
            reg_b = regs[1].reg_type.lower()
            if reg_b == 'gpr':
                reg_b += str(regs[1].size)
                # Only needed if the first operand did not trigger it already.
                if 'gpr' not in reg_a:
                    gprPush, gprPop, zeroGPR = self.__initialise_gprs()
        if len(regs) == 3:
            reg_c = regs[2].reg_type.lower()
            if reg_c == 'gpr':
                reg_c += str(regs[2].size)
                if 'gpr' not in reg_a and 'gpr' not in reg_b:
                    gprPush, gprPop, zeroGPR = self.__initialise_gprs()
        # The value setup follows the only operand, or else the second one.
        copy = self.__copy_regs(regs[0] if len(regs) == 1 else regs[1])
        return (reg_a, reg_b, reg_c, gprPush, gprPop, zeroGPR, copy)

    # Initialise 11 general purpose registers and set them to zero
    def __initialise_gprs(self):
        """Return push, pop (reverse order) and xor-zero code for ``gprs64``."""
        gprPush = ''.join('\t\tpush {}\n'.format(reg) for reg in self.gprs64)
        gprPop = ''.join('\t\tpop {}\n'.format(reg) for reg in reversed(self.gprs64))
        zeroGPR = ''.join('\t\txor {}, {}\n'.format(reg, reg) for reg in self.gprs64)
        return (gprPush, gprPop, zeroGPR)

    # Copy created values in specific register
    def __copy_regs(self, reg):
        """Return code that copies the value held in xmm0 into work registers.

        Different handling for GPR, MMX and SSE/AVX registers; any other
        register class yields an empty string.
        """
        copy = '\t\t# copy DP 1.0\n'
        gpr = self.ops['gpr64']
        if reg.reg_type == 'GPR':
            copy += '\t\tvmovq {}, xmm0\n'.format(gpr[0])
            copy += '\t\tvmovq {}, xmm0\n'.format(gpr[1])
            copy += '\t\t# Create DP 2.0\n'
            copy += '\t\tadd {}, {}\n'.format(gpr[1], gpr[0])
            copy += '\t\t# Create DP 0.5\n'
            copy += '\t\tdiv {}\n'.format(gpr[0])
            copy += '\t\tmovq {}, {}\n'.format(gpr[2], gpr[0])
            copy += '\t\tvmovq {}, xmm0\n'.format(gpr[0])
        elif reg.reg_type == 'MMX':
            mmx = self.ops['mmx']
            copy += '\t\tvmovq {}, xmm0\n'.format(mmx[0])
            copy += '\t\tvmovq {}, xmm0\n'.format(mmx[1])
            copy += '\t\tvmovq {}, xmm0\n'.format(gpr[0])
            copy += '\t\t# Create DP 2.0\n'
            # Was `ops[...]` without `self.` and raised NameError.
            # NOTE(review): the emitted instruction text is unchanged; whether
            # the assembler accepts it for MMX operands still needs checking.
            copy += '\t\tadd {}, {}\n'.format(mmx[1], mmx[0])
            copy += '\t\t# Create DP 0.5\n'
            copy += '\t\tdiv {}\n'.format(gpr[0])
            copy += '\t\tmovq {}, {}\n'.format(mmx[2], gpr[0])
        elif reg.reg_type in ('XMM', 'YMM', 'ZMM'):
            vec = self.ops[reg.reg_type.lower()]
            copy += '\t\tvmovaps {}, {}\n'.format(vec[0], vec[0])
            copy += '\t\tvmovaps {}, {}\n'.format(vec[1], vec[0])
            copy += '\t\t# Create DP 2.0\n'
            copy += '\t\tvaddpd {}, {}, {}\n'.format(vec[1], vec[1], vec[1])
            copy += '\t\t# Create DP 0.5\n'
            copy += '\t\tvdivpd {}, {}, {}\n'.format(vec[2], vec[0], vec[1])
        else:
            copy = ''
        return copy

    def __define_header(self):
        """Return ``(def_instr, ninstr, init, expand)`` for the file header."""
        def_instr = '#define INSTR ' + self.instr + '\n'
        ninstr = '#define NINST ' + self.num_instr + '\n'
        init = ('#define N edi\n'
                '#define i r8d\n\n\n'
                '.intel_syntax noprefix\n'
                '.globl ninst\n'
                '.data\n'
                'ninst:\n'
                '.long NINST\n'
                '.text\n'
                '.globl latency\n'
                '.type latency, @function\n'
                '.align 32\n'
                'latency:\n'
                '\t\tpush\trbp\n'
                '\t\tmov\trbp, rsp\n'
                '\t\txor\ti, i\n'
                '\t\ttest\tN, N\n'
                '\t\tjle\tdone\n')
        # Expand to AVX(512) if necessary; zmm takes precedence over ymm.
        operands = (self.reg_a, self.reg_b, self.reg_c)
        expand = ''
        if 'ymm' in operands:
            expand = ('\t\t# expand from SSE to AVX\n'
                      '\t\tvinsertf128 ymm0, ymm0, xmm0, 0x1\n')
        if 'zmm' in operands:
            expand = ('\t\t# expand from SSE to AVX\n'
                      '\t\tvinsertf128 ymm0, ymm0, xmm0, 0x1\n'
                      '\t\t# expand from AVX to AVX512\n'
                      '\t\tvinsert64x4 zmm0, zmm0, ymm0, 0x1\n')
        return (def_instr, ninstr, init, expand)

    # Create latency loop
    def __define_loop_lat(self):
        """Return the loop body used for the latency measurement.

        Operand combinations not covered below (for instance three operands
        whose first key equals neither of the others) produce a loop without
        INSTR lines.
        """
        lines = ['loop:\n', '\t\tinc i\n']
        count = int(self.num_instr)
        # num_instr is even, so count // 2 equals len(range(0, count, 2)).
        pairs = count // 2
        if self.num_regs == 1:
            a = self.ops[self.reg_a]
            lines.extend(['\t\tINSTR {}\n'.format(a[0])] * count)
        elif self.num_regs == 2 and self.reg_a == self.reg_b:
            a, b = self.ops[self.reg_a], self.ops[self.reg_b]
            lines.extend(['\t\tINSTR {}, {}\n'.format(a[0], b[1]),
                          '\t\tINSTR {}, {}\n'.format(b[1], b[0])] * pairs)
        elif self.num_regs == 2 and self.reg_a != self.reg_b:
            a, b = self.ops[self.reg_a], self.ops[self.reg_b]
            lines.extend(['\t\tINSTR {}, {}\n'.format(a[0], b[0])] * (2 * pairs))
        elif self.num_regs == 3 and self.reg_a == self.reg_b:
            a, b, c = self.ops[self.reg_a], self.ops[self.reg_b], self.ops[self.reg_c]
            lines.extend(['\t\tINSTR {}, {}, {}\n'.format(a[0], b[1], c[0]),
                          '\t\tINSTR {}, {}, {}\n'.format(a[1], b[0], c[0])] * pairs)
        elif self.num_regs == 3 and self.reg_a == self.reg_c:
            a, b, c = self.ops[self.reg_a], self.ops[self.reg_b], self.ops[self.reg_c]
            lines.extend(['\t\tINSTR {}, {}, {}\n'.format(a[0], b[0], c[0]),
                          '\t\tINSTR {}, {}, {}\n'.format(a[1], b[0], c[0])] * pairs)
        lines.append('\t\tcmp i, N\n'
                     '\t\tjl loop\n')
        return ''.join(lines)

    # Create throughput loop
    def __define_loop_thrpt(self):
        """Return the loop body used for the throughput measurement."""
        lines = ['loop:\n', '\t\tinc i\n']
        has_second = self.num_regs in (2, 3)
        has_third = self.num_regs == 3
        first = self.ops[self.reg_a]
        for i in range(int(self.num_instr)):
            ext = ''
            if has_second:
                ext = ', {}'.format(self.ops[self.reg_b][i % 3])
            if has_third:
                ext += ', {}'.format(self.ops[self.reg_c][i % 3])
            # The first three instructions use registers 3..5; later ones
            # walk through the whole register list.
            regNum = i % len(first) if i > 2 else (i + 3) % len(first)
            lines.append('\t\tINSTR {}{}\n'.format(first[regNum], ext))
        lines.append('\t\tcmp i, N\n'
                     '\t\tjl loop\n')
        return ''.join(lines)

    def __is_in_dir(self, name, path):
        """Return True if a file called ``name`` exists anywhere below ``path``."""
        return any(name in files for _root, _dirs, files in os.walk(path))

42
data/ivb_throughput.csv Normal file
View File

@@ -0,0 +1,42 @@
instr,clock_cycles
vmovapd-TP,0.84
vaddsd-TP,1.016
inc-TP,0.446
cmp-TP,0.447
inc-rrxmm-TP,0.446
cmp-rrxmm-TP,0.446
vmovq-TP,1.17
vmovsd-TP,1.17
xor-TP,0.336
vxorpd-avx-TP,0.335
vmovq-rxmmxmm-TP,1.004
vxorps-TP,0.336
vunpckhpd-TP,1.177
test-TP,0.446
vmulsd-TP,1.0170000000000001
test-rrxmm-TP,0.446
add-TP,0.47200000000000003
neg-TP,0.447
add-rrxmm-TP,0.47100000000000003
mov-TP,0.386
mov-rrxmm-TP,0.37
vaddpd-avx-TP,1.016
xor-rrxmm-TP,0.336
sub-TP,0.335
sub-rrxmm-TP,0.336
vxorpd-TP,0.336
vmovapd-avx-TP,0.8370000000000001
vmulpd-avx-TP,1.021
vsubsd-TP,1.014
vmovaps-TP,0.836
vaddpd-TP,1.015
vsubpd-avx-TP,1.014
dec-TP,0.447
lea-TP,0.5
jb-TP,0.447
vmulss-xmmxmmxmm-TP,1.0
vaddss-xmmxmmxmm-TP,1.0
vcvtsi2ss-xmmxmmr-TP,1.0859999999999999
xor-rr-TP,0.413
vxorps-xmmxmmxmm-TP,0.3333333333333333
inc-rxmmxmm-TP,0.390
1 instr clock_cycles
2 vmovapd-TP 0.84
3 vaddsd-TP 1.016
4 inc-TP 0.446
5 cmp-TP 0.447
6 inc-rrxmm-TP 0.446
7 cmp-rrxmm-TP 0.446
8 vmovq-TP 1.17
9 vmovsd-TP 1.17
10 xor-TP 0.336
11 vxorpd-avx-TP 0.335
12 vmovq-rxmmxmm-TP 1.004
13 vxorps-TP 0.336
14 vunpckhpd-TP 1.177
15 test-TP 0.446
16 vmulsd-TP 1.0170000000000001
17 test-rrxmm-TP 0.446
18 add-TP 0.47200000000000003
19 neg-TP 0.447
20 add-rrxmm-TP 0.47100000000000003
21 mov-TP 0.386
22 mov-rrxmm-TP 0.37
23 vaddpd-avx-TP 1.016
24 xor-rrxmm-TP 0.336
25 sub-TP 0.335
26 sub-rrxmm-TP 0.336
27 vxorpd-TP 0.336
28 vmovapd-avx-TP 0.8370000000000001
29 vmulpd-avx-TP 1.021
30 vsubsd-TP 1.014
31 vmovaps-TP 0.836
32 vaddpd-TP 1.015
33 vsubpd-avx-TP 1.014
34 dec-TP 0.447
35 lea-TP 0.5
36 jb-TP 0.447
37 vmulss-xmmxmmxmm-TP 1.0
38 vaddss-xmmxmmxmm-TP 1.0
39 vcvtsi2ss-xmmxmmr-TP 1.0859999999999999
40 xor-rr-TP 0.413
41 vxorps-xmmxmmxmm-TP 0.3333333333333333
42 inc-rxmmxmm-TP 0.390

339
get_instr.py Executable file
View File

@@ -0,0 +1,339 @@
#!/apps/python/3.5-anaconda/bin/python
import sys
import re
from Testcase import *
# Source-level marker; check_line starts analysing after a line containing it.
marker = r'//STARTLOOP'
# A whitespace character, one or more lower-case hex digits and a colon;
# check_line uses it to recognise lines that hold disassembled code.
asm_line = re.compile(r'\s[0-9a-f]+[:]')
# Number of indentation characters found in front of the marker.
numSeps = 0
# Set to 2 when a marker is seen; check_line only inspects lines while > 0.
sem = 0
# Instruction form -> number of occurrences.
db = {}
# (instruction form, count) pairs; filled by sort_db().
sorted_db = []
# Line counter; extract_instr resets it to 1 and increments it per line.
lncnt = 1
#cnt=0
# Name of the file most recently passed to extract_instr.
fname = ""
# Indentation character (' ' or '\t') chosen by set_counter_char.
cntChar = ''
# True until the first marker line has been processed.
first = True
def extract_instr(asmFile):
    """Feed every line of the ``.log`` file ``asmFile`` to ``check_line``.

    The module globals ``fname`` and ``lncnt`` are updated on the way.
    A name that does not end in ``.log`` terminates the program through
    ``sys.exit()``. A file that cannot be opened is reported and skipped;
    before, execution continued with an unbound file object and crashed.
    """
    global lncnt
    global fname
    fname = asmFile
    # Check if parameter is in the correct file format
    if not asmFile.endswith(".log"):
        print("Invalid argument")
        sys.exit()
    # Open file
    try:
        f = open(asmFile, "r")
    except IOError:
        print("IOError: File not found")
        return
    # Analyse code line by line and check the instructions; the context
    # manager closes the file even if the analysis raises.
    with f:
        lncnt = 1
        for line in f:
            check_line(line)
            lncnt += 1
def check_line(line):
    """Track the marked code region and pass its assembly lines on.

    A line containing ``marker`` records its indentation depth and arms the
    semaphore. While the semaphore is positive, disassembly lines without
    ``//`` go to ``check_instr``; any other line indented no deeper than the
    marker decrements the semaphore.
    """
    global numSeps
    global sem
    global first
    if marker in line:
        # The indentation character is determined once, at the first marker.
        if first:
            set_counter_char(line)
            first = False
        before_marker = re.split(marker, line)[0]
        numSeps = before_marker.count(cntChar)
        sem = 2
        return
    if sem <= 0:
        # Outside of any marked snippet.
        return
    if re.search(asm_line, line):
        # Lines carrying a comment are skipped; otherwise the text after the
        # last tabulator is analysed.
        if r'//' not in line:
            check_instr(re.split(r'\t', line)[-1])
        return
    leading = re.split(r'\S', line)[0]
    if leading.count(cntChar) <= numSeps:
        # Not in the loop anymore - or yet - so the semaphore is decremented.
        sem -= 1
def set_counter_char(line):
    """Store in ``cntChar`` whether ``line`` is indented with spaces or tabs.

    Only the text in front of ``marker`` is examined.

    Raises:
        NotImplementedError: if that text contains both spaces and tabs, or
            neither of them.
    """
    global cntChar
    indent = re.split(marker, line)[0]
    uses_spaces = indent.count(" ") != 0
    uses_tabs = indent.count("\t") != 0
    # Exactly one of the two characters has to be present.
    if uses_spaces == uses_tabs:
        raise NotImplementedError("Indentation of code is only supported for whitespaces and tabs.")
    cntChar = ' ' if uses_spaces else '\t'
def check_instr(instr):
    """Classify one disassembled instruction and count its instruction form.

    The operands are turned into ``Parameter``, ``Register`` or ``MemAddr``
    objects, the resulting form (mnemonic plus operand kinds) is counted in
    ``db`` and, on the first appearance of a form whose operands are all
    registers, a ``Testcase`` is written.

    Text that is empty once the ``data32`` prefixes are removed is ignored;
    it used to raise an IndexError. A line consisting of a single byte
    (two hex digits) is ignored as before.
    """
    # Check for strange clang padding bytes
    while instr.startswith("data32"):
        instr = instr[7:]
    # Separate mnemonic and operands
    tokens = instr.split()
    if not tokens:
        return
    mnemonic = tokens[0]
    params = "".join(tokens[1:])
    # Check if line is not only a byte
    if re.fullmatch(r'[0-9a-f]{2}', mnemonic):
        return
    # Check if there's one or more operand and store all in a list
    param_list = flatten(separate_params(params))
    regList = list(param_list)
    # Check operands and separate them by IMMEDIATE (IMD), REGISTER (REG),
    # MEMORY (MEM) or LABEL (LBL)
    for i, op in enumerate(param_list):
        if len(op) <= 0:
            op = Parameter("NONE")
        elif op[0] == '$':
            op = Parameter("IMD")
        elif op[0] == '%' and '(' not in op:
            # The register name ends where an opmask in braces begins.
            j = len(op)
            opmask = False
            if '{' in op:
                j = op.index('{')
                opmask = True
            op = Register(op[1:j], opmask)
        elif '<' in op:
            op = Parameter("LBL")
        else:
            op = MemAddr(op)
        param_list[i] = op.print()
        regList[i] = op
    # Join mnemonic and operand(s) to an instruction form
    tabs = "\t" if len(mnemonic) > 7 else "\t\t"
    instr_form = mnemonic + tabs + (" ".join(param_list))
    # Check in database for instruction form and increment the counter
    first_appearance = instr_form not in db
    db[instr_form] = db.get(instr_form, 0) + 1
    # Create testcase for instruction form at its first appearance,
    # but (as far as now) only for instr forms with only registers as operands
    if first_appearance and all(isinstance(par, Register) for par in regList):
        # Create testcase with reversed param list, due to the fact its intel syntax!
        tc = Testcase(mnemonic, list(reversed(regList)), '24')
        tc.write_testcase()
def separate_params(params):
    """Split an operand string at its top-level commas.

    Commas inside a parenthesised memory reference belong to that operand.
    The result is a right-nested list ``[first, [second, [third]]]`` that is
    meant to be passed through ``flatten``. In an operand string without any
    comma, everything from a ``#`` on is cut off.

    Previously a memory operand that followed another operand, as in
    ``$0x1,(%rax),%ymm0``, was glued to the operands in front of it because
    the split position was taken from the closing parenthesis even when a
    comma came first.
    """
    comma = params.find(',')
    if comma == -1:
        hash_pos = params.find('#')
        return [params[:hash_pos]] if hash_pos != -1 else [params]

    opening = params.find('(')
    closing = params.find(')')
    if closing != -1 and 0 <= opening < comma:
        # The first operand is a memory reference: it ends right after ')'.
        if params[closing + 1:closing + 2] == ',':
            cut = closing + 1
        else:
            # Nothing follows the memory reference.
            return [params]
    else:
        cut = comma
    return [params[:cut], separate_params(params[cut + 1:])]
def sort_db():
    """Store the entries of ``db`` in ``sorted_db``, highest count first."""
    global sorted_db
    ranked = list(db.items())
    ranked.sort(key=lambda entry: entry[1], reverse=True)
    sorted_db = ranked
def print_sorted_db():
    """Print all instruction forms by descending count, then the total."""
    sort_db()
    total = 0
    print("Number of\tmnemonic")
    print("calls\n")
    for instr_form, calls in sorted_db:
        print(str(calls) + "\t\t" + instr_form)
        total += calls
    print("\nCumulated number of instructions: " + str(total))
def save_db():
    """Write ``db`` to ``.cnt_asm_ops.db`` in the current directory.

    Each line holds the instruction form, a tabulator and the count. The
    context manager closes the file even if writing fails.
    """
    with open(".cnt_asm_ops.db", "w") as db_file:
        db_file.writelines(
            instr_form + "\t" + str(count) + "\n" for instr_form, count in db.items()
        )
def load_db():
    """Read ``.cnt_asm_ops.db`` from the current directory into ``db``.

    The file format is the one written by ``save_db``: the instruction form
    (which itself contains tabulators), one more tabulator and the count.
    Splitting at the last tabulator therefore recovers both parts for short
    and long mnemonics alike. A missing trailing newline on the last line no
    longer cuts off the final digit of the count, and the file is closed
    even if a line cannot be parsed.
    """
    try:
        db_file = open(".cnt_asm_ops.db", "r")
    except FileNotFoundError:
        print("no database found in current directory")
        return
    with db_file:
        for line in db_file:
            entry = line.rstrip("\n")
            if not entry:
                # Ignore blank lines.
                continue
            instr_form, _sep, numCalls = entry.rpartition("\t")
            db[instr_form] = int(numCalls)
def flatten(l):
    """Return a flat list with the non-list items of ``l`` in order.

    Only ``list`` instances are descended into; every other item, strings
    included, is kept as it is. The traversal is iterative: the recursive
    version needed one stack frame per element, which raised RecursionError
    on long lists, and copied the remaining list at every step.
    """
    flat = []
    pending = [iter(l)]
    while pending:
        for item in pending[-1]:
            if isinstance(item, list):
                # Descend; the outer iterator resumes afterwards.
                pending.append(iter(item))
                break
            flat.append(item)
        else:
            pending.pop()
    return flat
class Parameter(object):
    """Generic instruction operand that is described only by its kind."""

    # Operand kinds accepted by the constructor (compared in upper case).
    type_list = ["REG", "MEM", "IMD", "LBL", "NONE"]

    def __init__(self, ptype, name=""):
        """Store the upper-cased operand kind.

        ``name`` is accepted for signature compatibility but is not used.

        Raises:
            NameError: if the upper-cased ``ptype`` is not in ``type_list``.
        """
        kind = ptype.upper()
        self.ptype = kind
        if kind not in self.type_list:
            raise NameError("Type not supported: " + ptype)

    def print(self):
        """Return the operand kind, or an empty string for kind ``NONE``."""
        return "" if self.ptype == "NONE" else self.ptype
class MemAddr(Parameter):
    """Memory operand classified by which address components it contains.

    The operand text is inspected for a ``:`` (segment register), for text in
    front of a ``(`` (offset) and for the comma count between the parentheses
    (base only, or base/index/scale).
    """

    segment_regs = ["CS", "DS", "SS", "ES", "FS", "GS"]
    scales = [1, 2, 4, 8]

    def __init__(self, name):
        """Parse the operand text ``name`` and set the component flags.

        Raises:
            NameError: if the text before ``:`` (minus its first character) is
                not a known segment register, or if the parenthesised part
                has neither zero commas nor two commas with a valid scale.
        """
        self.sreg = self.offset = self.base = self.index = self.scale = False

        colon = name.find(':')
        if colon != -1:
            # The first character is skipped (presumably the '%' sigil).
            if name[1:colon].upper() not in self.segment_regs:
                raise NameError("Type not supported: " + name)
            self.sreg = self.offset = True

        paren = name.find('(')
        # No parentheses at all, or something written in front of them.
        if paren != 0:
            self.offset = True
        if paren == -1:
            return

        # Everything between '(' and the final character of the operand.
        self.parentheses = name[paren + 1:-1]
        self.commacnt = self.parentheses.count(',')
        if self.commacnt == 0:
            self.base = True
        elif self.commacnt == 2 and int(self.parentheses[-1:]) in self.scales:
            self.base = self.index = self.scale = True
        else:
            raise NameError("Type not supported: " + name)

    def print(self):
        """Return the symbolic memory format, e.g. ``MEM(offset(base))``."""
        pieces = ["MEM("]
        if self.sreg:
            pieces.append("sreg:")
        if self.offset:
            pieces.append("offset")
        if self.base and not self.index:
            pieces.append("(base)")
        elif self.base and self.index and self.scale:
            pieces.append("(base, index, scale)")
        pieces.append(")")
        self.mem_format = "".join(pieces)
        return self.mem_format
def _register_size_table():
    """Return the lookup table ``register name -> (bit width, register class)``."""
    table = {}

    # Individually named registers, grouped by bit width. Segment registers,
    # flags and instruction pointers are filed under the "GPR" class.
    named_gprs = {
        8: ("AH", "AL", "BH", "BL", "CH", "CL", "DH", "DL",
            "BPL", "SIL", "DIL", "SPL"),
        16: ("AX", "BX", "CX", "DX", "BP", "SI", "DI", "SP",
             "CS", "DS", "SS", "ES", "FS", "GS"),
        32: ("EAX", "EBX", "ECX", "EDX", "EBP", "ESI", "EDI", "ESP",
             "EFLAGS", "EIP"),
        64: ("RAX", "RBX", "RCX", "RDX", "RBP", "RSI", "RDI", "RSP",
             "RFLAGS", "RIP"),
    }
    for width, names in named_gprs.items():
        for reg in names:
            table[reg] = (width, "GPR")

    # R8..R15 with their sub-register suffixes (both the L and B spelling
    # of the 8-bit form are accepted).
    for number in range(8, 16):
        for suffix, width in (("L", 8), ("B", 8), ("W", 16), ("D", 32), ("", 64)):
            table["R{}{}".format(number, suffix)] = (width, "GPR")

    # Numbered register files: (name prefix, register class, width, count).
    banks = (
        ("ST", "FPU", 80, 8),
        ("MM", "MMX", 64, 8),
        ("XMM", "XMM", 128, 32),
        ("YMM", "YMM", 256, 32),
        ("ZMM", "ZMM", 512, 32),
        ("K", "K", 64, 8),
        ("BND", "BND", 128, 4),
    )
    for prefix, reg_class, width, count in banks:
        for number in range(count):
            table[prefix + str(number)] = (width, reg_class)
    return table


class Register(Parameter):
    """Register operand with its bit width and register class.

    Compared with the previous revision this version lists ``BX`` instead of
    the misspelt ``BC``, really checks whether the name is known
    (``if[...]`` tested a non-empty list and was therefore always true) and
    looks the register up by its upper-cased name.
    """

    # Register name (upper case) -> (size in bits, register class).
    sizes = _register_size_table()

    def __init__(self, name, mask=False):
        """Look up ``name`` case-insensitively.

        Args:
            name: register name without any sigil, e.g. ``"xmm0"``.
            mask: truthy if an opmask is attached to the operand.

        Raises:
            NameError: if the register is not contained in ``sizes``; the
                module-level line counter ``lncnt`` is printed first.
        """
        self.name = name.upper()
        self.mask = mask
        try:
            self.size, self.reg_type = self.sizes[self.name]
        except KeyError:
            # Print the current input line number as a hint for the user.
            print(lncnt)
            raise NameError("Register name not in dictionary: " + self.name) from None

    def print(self):
        """Return register class and size, followed by ``{opmask}`` if masked."""
        opmask = "{opmask}" if self.mask else ""
        return self.reg_type + str(self.size) + opmask
if __name__ == "__main__":
    # Sample registers; constructing them exercises the register lookup.
    r0 = Register("ymm0")
    r1 = Register("xmm0")
    r2 = Register("rax")
    # Every command line argument is treated as a log file to analyse; the
    # summary is printed only if at least one file was given.
    log_files = sys.argv[1:]
    if log_files:
        for log_file in log_files:
            extract_instr(log_file)
        print_sorted_db()

432
osaca.py Executable file
View File

@@ -0,0 +1,432 @@
#!/apps/python/3.5-anaconda/bin/python
import argparse
import sys
import subprocess
import os
import re
import Params
import pandas as pd
from datetime import datetime
#----------Global variables--------------
# Target architecture; check_arch() tests it against archList.
arch = ''
# Architecture names accepted by check_arch().
archList = ['SNB','IVB','HSW', 'BDW', 'SKL']
# Path of the input file; checked with os.path.isfile() and read by
# create_elffile() / get_file().
filepath = ''
# Content of the input; becomes a list of lines once a file was loaded.
srcCode = ''
# Source-level marker; check_line starts analysing after a line containing it.
marker = r'//STARTLOOP'
# A whitespace character, one or more lower-case hex digits and a colon;
# check_line uses it to recognise lines that hold disassembled code.
asm_line = re.compile(r'\s[0-9a-f]+[:]')
# Number of indentation characters found in front of the marker.
numSeps = 0
# Set to 2 when a marker is seen; check_line only inspects lines while > 0.
sem = 0
# True until the first marker line has been processed.
firstAppearance = True
# NOTE(review): the remaining globals are not used in the part of the file
# examined here (instrForms and longestInstr are only declared global in
# check_instr); their roles should be confirmed against the code below.
lncnt = 0
instrForms = list()
df = ''
output = ''
horizontalSeparator = ''
total_tp = 0
longestInstr = 30
cycList = []
reciList = []
#---------------------------------------
def check_arch():
    """Return True if the global ``arch`` is one of the names in ``archList``."""
    return arch in archList
def check_elffile():
    """Return True if ``filepath`` exists and objdump reports an elf64 file.

    The file is disassembled through ``create_elffile`` and the second line
    of the output is searched for ``file format elf64``. Output with fewer
    than two lines now yields False; it used to raise an IndexError.
    """
    if not os.path.isfile(filepath):
        return False
    create_elffile()
    return len(srcCode) > 1 and 'file format elf64' in srcCode[1]
def check_file():
    """Load ``filepath`` via ``get_file`` and return True if it is a file."""
    if not os.path.isfile(filepath):
        return False
    get_file()
    return True
def create_elffile():
    """Run ``objdump --source`` on ``filepath`` and store its output lines.

    The decoded standard output, split at newlines, is assigned to the
    global ``srcCode``.
    """
    global srcCode
    command = ['objdump', '--source', filepath]
    finished = subprocess.run(command, stdout=subprocess.PIPE)
    listing = finished.stdout.decode('utf-8')
    srcCode = listing.split('\n')
def get_file():
    """Read ``filepath`` and store its content, split at newlines, in ``srcCode``.

    A file that cannot be opened is reported and ``srcCode`` is left
    unchanged; before, execution continued with an unbound file object and
    crashed. The content now replaces ``srcCode`` instead of being appended
    to its previous value, which failed once ``srcCode`` had become a list.
    """
    global srcCode
    try:
        f = open(filepath, 'r')
    except IOError:
        print('IOError: file \'{}\' not found'.format(filepath))
        return
    with f:
        srcCode = f.read().split('\n')
def check_line(line):
    """Process one line of the disassembly and forward loop instructions to check_instr().

    A line containing ``marker`` records the marker's indentation in
    ``numSeps`` and arms the countdown ``sem`` with 2. While ``sem`` is
    positive, assembly lines (matching ``asm_line``) without ``//`` are passed
    to check_instr(); any other line indented no deeper than the marker
    decrements ``sem``.
    """
    global numSeps
    global sem
    global firstAppearance
    if marker in line:
        # The very first marker decides whether spaces or tabs are counted
        if firstAppearance:
            set_char_counter(line)
            firstAppearance = False
        indentation = re.split(marker, line)[0]
        numSeps = indentation.count(cntChar)
        sem = 2
        return
    if sem <= 0:
        # Outside of the marked snippet
        return
    if re.search(asm_line, line):
        # Assembly line: skip it when it carries a comment, otherwise analyse
        # the text after the last tabulator
        if r'//' not in line:
            check_instr(re.split(r'\t', line)[-1])
        return
    leading = re.split(r'\S', line)[0]
    if leading.count(cntChar) <= numSeps:
        # Not in the loop anymore - or not yet
        sem -= 1
def set_char_counter(line):
    """Store in ``cntChar`` the character (space or tab) that indents the marker line.

    Raises NotImplementedError when the text in front of the marker contains
    both kinds of characters or neither of them.
    """
    global cntChar
    prefix = re.split(marker, line)[0]
    has_spaces = prefix.count(' ') != 0
    has_tabs = prefix.count('\t') != 0
    if has_spaces and not has_tabs:
        cntChar = ' '
    elif has_tabs and not has_spaces:
        cntChar = '\t'
    else:
        raise NotImplementedError('Indentation of code is only supported for whitespaces and tabs.')
def check_instr(instr):
    """Classify the operands of one instruction and append the result to ``instrForms``.

    Leading ``data32`` prefixes are dropped and lines whose first token is a
    bare two-digit hex byte are ignored. Each operand becomes a Params
    object (NONE, IMD, REG, LBL or MEM); the stored entry is
    ``[mnemonic, operands in reversed order..., instruction text]``.
    ``longestInstr`` is raised to the length of the instruction if needed.
    """
    global instrForms
    global longestInstr
    # Check for strange clang padding bytes
    while instr.startswith('data32'):
        instr = instr[7:]
    tokens = instr.split()
    mnemonic = tokens[0]
    params = ''.join(tokens[1:])
    # A lone byte is not an instruction
    if len(mnemonic) == 2 and re.match(r'[0-9a-f]{2}', mnemonic):
        return
    typed_operands = []
    for text in flatten(separate_params(params)):
        if len(text) <= 0:
            op = Params.Parameter('NONE')
        elif text[0] == '$':
            op = Params.Parameter('IMD')
        elif text[0] == '%' and '(' not in text:
            # Register, optionally followed by an opmask in braces
            has_mask = '{' in text
            name_end = text.index('{') if has_mask else len(text)
            op = Params.Register(text[1:name_end], has_mask)
        elif '<' in text:
            op = Params.Parameter('LBL')
        else:
            op = Params.MemAddr(text)
        # The original code calls print() on every operand; keep the call
        op.print()
        typed_operands.append(op)
    longestInstr = max(longestInstr, len(instr))
    instrForms.append([mnemonic] + typed_operands[::-1] + [instr])
def separate_params(params):
    """Split an operand string into its operands.

    Commas inside parentheses belong to a memory operand and do not
    separate operands; everything from the first ``#`` on is a comment and
    is dropped. The previous implementation cut after the first ``)`` even
    when other operands preceded it, so ``$0x1,(%rax),%ymm0`` produced the
    bogus operand ``$0x1,(%rax)``.

    Returns the right-nested list shape callers already flatten:
    ``'a,b,c'`` -> ``['a', ['b', ['c']]]`` and ``'a'`` -> ``['a']``.
    """
    comment_start = params.find('#')
    if comment_start != -1:
        params = params[:comment_start]
    operands = []
    depth = 0
    start = 0
    for pos, char in enumerate(params):
        if char == '(':
            depth += 1
        elif char == ')':
            if depth:
                depth -= 1
        elif char == ',' and depth == 0:
            operands.append(params[start:pos])
            start = pos + 1
    operands.append(params[start:])
    # Rebuild the nested structure from the last operand outwards
    nested = [operands[-1]]
    for operand in reversed(operands[:-1]):
        nested = [operand, nested]
    return nested
def flatten(l):
    """Return the elements of the arbitrarily nested list *l* as one flat list.

    An empty list is returned unchanged; empty sublists contribute nothing.
    """
    if l == []:
        return l
    flat = []
    for item in l:
        if isinstance(item, list):
            flat.extend(flatten(item))
        else:
            flat.append(item)
    return flat
def read_csv():
    """Load ``data/<arch>_throughput.csv`` (lower-cased ``arch``) into the global ``df``."""
    global df
    csv_path = 'data/{}_throughput.csv'.format(arch.lower())
    df = pd.read_csv(csv_path)
def create_horiz_sep():
    """Set ``horizontalSeparator`` to a dashed rule eight characters wider than ``longestInstr``."""
    global horizontalSeparator
    rule_width = longestInstr + 8
    horizontalSeparator = rule_width * '-'
def _lookup_tp(instr_key):
    """Return the clock cycles stored in ``df`` for exactly *instr_key*, or None."""
    hits = df[df.instr == instr_key].clock_cycles.values
    if len(hits) == 0:
        return None
    return hits[0]


def create_output():
    """Build the throughput report for all entries of ``instrForms`` in ``output``.

    For every instruction form the key ``<mnemonic>-<operands>-TP`` is looked
    up in ``df``. If it is missing, memory operands are replaced by a register
    operand of the same instruction and the lookup is repeated; if that fails
    as well the instruction is listed with 0 cycles and marked with ``*``.
    Instructions without any register operand are reported on stdout and
    left out of the table. ``total_tp`` accumulates the cycles of all listed
    instructions and ``longestInstr`` is capped at 70.

    Changes compared to the previous version: two syntax errors in the
    fallback branch are fixed, register operands are recognised by their
    type (``Params.Register``) instead of constructing an undefined
    ``Register`` from the type string, and keys are compared exactly so that
    the parentheses of memory operands are not interpreted as a regular
    expression.
    """
    global total_tp
    global output
    global longestInstr
    warning = False
    # Check the output alignment depending on the longest instruction
    if longestInstr > 70:
        longestInstr = 70
    create_horiz_sep()
    ws = ' '*(len(horizontalSeparator)-23)
    # Write general information about the benchmark
    output = ( '--'+horizontalSeparator+'\n'
               '| Analyzing of file:\t'+os.getcwd()+'/'+filepath+'\n'
               '| Architecture:\t\t'+arch+'\n'
               '| Timestamp:\t\t'+datetime.now().strftime('%Y-%m-%d %H:%M:%S')+'\n'
               '|\n| INSTRUCTION'+ws+'CLOCK CYCLES\n'
               '| '+horizontalSeparator+'\n|\n')
    for elem in instrForms:
        mnemonic = elem[0]
        ops = elem[1:-1]
        # General purpose registers are keyed by size only ('r64'), everything
        # else by its lower-cased type description
        opExt = ['r'+str(op.size)
                 if (isinstance(op, Params.Register) and op.reg_type == 'GPR')
                 else op.print().lower()
                 for op in ops]
        notFound = False
        tp = _lookup_tp(mnemonic+'-'+'_'.join(opExt)+'-TP')
        if tp is None:
            # Did not find the exact instruction form.
            # Try to find the instruction form for register operands only
            opExtRegs = [isinstance(op, Params.Register) for op in ops]
            if True not in opExtRegs:
                # No register in the whole instruction form, so there is no
                # operand to take the register size from
                print('Feature not included yet')
                warning = True
                continue
            if not opExtRegs[0]:
                # Instruction stores result in memory. Check for storing in register instead
                if len(opExt) > 1:
                    if opExtRegs[1]:
                        opExt[0] = opExt[1]
                    elif len(opExt) > 2 and opExtRegs[2]:
                        opExt[0] = opExt[2]
            if len(opExtRegs) == 2 and not opExtRegs[1]:
                # Instruction loads value from memory and has only two operands.
                # Check for loading from register instead
                if opExtRegs[0]:
                    opExt[1] = opExt[0]
            if len(opExtRegs) == 3 and not opExtRegs[2]:
                # Instruction loads value from memory and has three operands.
                # Check for loading from register instead
                opExt[2] = opExt[0]
            tp = _lookup_tp(mnemonic+'-'+'_'.join(opExt)+'-TP')
            if tp is None:
                # No register form either. Set warning and go on with throughput 0
                tp = 0
                notFound = True
                warning = True
        # Add it to the overall throughput
        total_tp += tp
        # Check the alignment again
        numWhitespaces = longestInstr-len(elem[-1])
        ws = ' '*numWhitespaces+'| '
        n_f = ''
        if notFound:
            n_f = ' '*(5-len(str(tp)))+'*'
        output += '| '+elem[-1]+ws+str(tp)+n_f+'\n'
    # Finally write the total throughput
    numWhitespaces = longestInstr-27
    ws = ' '+' '*numWhitespaces
    output += ( '| '+horizontalSeparator+'\n'
                '| TOTAL ESTIMATED THROUGHPUT:'+ws+str(total_tp))
    if warning:
        output += ('\n\n* There was no throughput value found '
                   'for the specific instruction form.'
                   '\n Please create a testcase via the create_testcase-method '
                   'or add a value manually.')
def create_sequences():
    """Append the integers 1..100 to ``cycList`` and their reciprocals to ``reciList``."""
    global cycList
    global reciList
    numbers = range(1, 101)
    cycList.extend(numbers)
    reciList.extend(1/number for number in numbers)
def validate_TP(clkC, instr):
    """Round a measured cycle count to the nearest plausible value.

    *clkC* is compared with the first 100 entries of ``cycList`` and
    ``reciList`` in turn; the first entry lying within 5 % of the measurement
    is returned. If none matches, a warning naming *instr* is printed and
    *clkC* is returned unchanged.
    """
    def within_tolerance(reference):
        # Measurement lies strictly inside the +-5 % window around the reference
        return reference*1.05 > float(clkC) and reference*0.95 < float(clkC)

    for idx in range(100):
        if within_tolerance(cycList[idx]):
            # Value is probably correct, so round it to the estimated value
            return cycList[idx]
        if within_tolerance(reciList[idx]):
            # Value is probably correct, so round it to the estimated value
            return reciList[idx]
    # No value close to an integer or its reciprocal found
    print('Your measurement for {} is probably wrong. Please inspect your benchmark!'.format(instr))
    print('The program will continue with the given value')
    return clkC
def write_csv(csv):
    """Write the text *csv* to ``data/<arch>_throughput.csv`` (lower-cased ``arch``).

    If the file cannot be opened for writing, a message is printed and
    nothing is written (previously the function went on to use the unopened
    handle and failed with a NameError).
    """
    file_name = arch.lower()+'_throughput.csv'
    try:
        # the context manager closes the handle even if writing fails
        with open('data/'+file_name, 'w') as f:
            f.write(csv)
    except IOError:
        print('IOError: file \'{}\' not found in ./data'.format(file_name))
##---------------main functions depending on arguments----------------------
def include_ibench():
    """Merge the throughput values of an ibench output file into the CSV database.

    Exits the program when the architecture or the file path is invalid.
    Every input line containing ``TP`` is read as ``<instruction>: <cycles>``;
    the cycles are normalised with validate_TP(). Instructions that are not
    in the database yet are added, differing values for known instructions
    are only reported. Finally the database is written back with write_csv().
    """
    global df
    # Check args and exit program if something's wrong
    if not check_arch():
        print('Invalid microarchitecture.')
        sys.exit()
    if not check_file():
        print('Invalid file path or file format.')
        sys.exit()
    # Check for database for the chosen architecture
    read_csv()
    # Create sequence of numbers and their reciprocals to validate the measurements
    create_sequences()
    print('Everything seems fine! Let\'s start checking!')
    newData = []
    for line in srcCode:
        if 'TP' not in line:
            continue
        fields = line.split()
        if len(fields) < 2:
            # 'TP' without a value next to it: nothing to import from this line
            continue
        # Drop the last character of the name (the separator in front of the value)
        instr = fields[0][:-1]
        clkC = validate_TP(fields[1], instr)
        known = df.loc[df.instr == instr, 'clock_cycles'].values
        if len(known) == 0:
            # Instruction not in database yet --> add it
            newData.append([instr, clkC])
        elif known[0] != clkC:
            print('Different measurement for {}: {}(old) vs. {}(new)\nPlease check for correctness (no changes were made).'.format(instr, known[0], clkC))
    # Now merge the DataFrames and write new csv file.
    # pd.concat replaces DataFrame.append, which newer pandas versions no longer provide.
    if newData:
        df = pd.concat([df, pd.DataFrame(newData, columns=['instr','clock_cycles'])], ignore_index=True)
    csv = df.to_csv(index=False)
    write_csv(csv)
    print('ibench output {} successfully in database included.'.format(filepath.split('/')[-1]))
def inspect_binary():
    """Analyse the marked loop of the binary ``filepath`` and print the report.

    Exits the program when the architecture is unknown or the file is not an
    elf64 binary. Otherwise the throughput database is loaded, every line of
    the disassembly is passed to check_line() and the report produced by
    create_output() is printed.
    """
    # lncnt is a module level counter; without this declaration the
    # increment below raises UnboundLocalError
    global lncnt
    # Check args and exit program if something's wrong
    if not check_arch():
        print('Invalid microarchitecture.')
        sys.exit()
    if not check_elffile():
        print('Invalid file path or file format.')
        sys.exit()
    # Finally check for database for the chosen architecture
    read_csv()
    print('Everything seems fine! Let\'s start checking!')
    for line in srcCode:
        lncnt += 1
        check_line(line)
    create_output()
    print(output)
##------------------------------------------------------------------------------
##------------Main method--------------
def main():
    """Parse the command line and run either the ibench import or the binary analysis.

    Stores the parsed arguments in the globals ``inp``, ``arch`` (upper-cased)
    and ``filepath``. With ``--include-ibench`` the file is merged into the
    database via include_ibench(), otherwise inspect_binary() is executed.
    """
    global inp
    global arch
    global filepath
    # Parse args
    parser = argparse.ArgumentParser(description='Analyzes a marked innermost loop snippet for a given architecture type and prints out the estimated average throughput')
    parser.add_argument('--version', '-V', action='version', version='%(prog)s 0.1')
    parser.add_argument('--arch', dest='arch', type=str, help='define architecture')
    parser.add_argument('filepath', type=str, help='path to object (Binary, CSV)')
    parser.add_argument('--include-ibench', '-i', dest='incl', action='store_true', help='includes the given values in form of the output of ibench in the database')
    # Store args in global variables
    inp = parser.parse_args()
    # --arch may be omitted, in which case argparse yields None; use an empty
    # name so that check_arch() reports it instead of failing on None.upper()
    arch = (inp.arch or '').upper()
    filepath = inp.filepath
    if inp.incl:
        include_ibench()
    else:
        inspect_binary()
##------------Main method--------------
if __name__ == '__main__':
    main()

100
testcases/add-rr-TP.S Normal file
View File

@@ -0,0 +1,100 @@
# Micro-benchmark kernel for the instruction selected by the INSTR macro (add).
# Exports ninst (= NINST) and the function latency, which runs the loop body
# N times (N is edi, presumably the first integer argument) counting in i (r8d).
# File name suffix -TP: presumably the throughput variant; the INSTR copies
# rotate over several registers - TODO confirm.
#define INSTR add
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
# Nothing to measure when N <= 0
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# Save the registers that are overwritten below
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the working registers
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# copy DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# NOTE(review): add and div below are integer operations on the bit pattern of 1.0,
# so the DP 2.0 / DP 0.5 labels may be leftovers of a floating point template - confirm
# Create DP 2.0
add rbx, rax
# Create DP 0.5
div rax
movq rcx, rax
vmovq rax, xmm0
# Measured loop: NINST copies of INSTR per iteration
loop:
inc i
INSTR edx, eax
INSTR r9d, ebx
INSTR r10d, ecx
INSTR edx, eax
INSTR r9d, ebx
INSTR r10d, ecx
INSTR r11d, eax
INSTR r12d, ebx
INSTR r13d, ecx
INSTR r14d, eax
INSTR r15d, ebx
INSTR eax, ecx
INSTR ebx, eax
INSTR ecx, ebx
INSTR edx, ecx
INSTR r9d, eax
INSTR r10d, ebx
INSTR r11d, ecx
INSTR r12d, eax
INSTR r13d, ebx
INSTR r14d, ecx
INSTR r15d, eax
INSTR eax, ebx
INSTR ebx, ecx
cmp i, N
jl loop
# Restore the saved registers in reverse order
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/add-rr.S Normal file
View File

@@ -0,0 +1,100 @@
# Micro-benchmark kernel for the instruction selected by the INSTR macro (add).
# Exports ninst (= NINST) and the function latency, which runs the loop body
# N times (N is edi, presumably the first integer argument) counting in i (r8d).
# No -TP suffix in the file name: presumably the latency variant; the INSTR
# copies alternate between (eax, ebx) and (ebx, eax) - TODO confirm.
#define INSTR add
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
# Nothing to measure when N <= 0
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# Save the registers that are overwritten below
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the working registers
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# copy DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# NOTE(review): add and div below are integer operations on the bit pattern of 1.0,
# so the DP 2.0 / DP 0.5 labels may be leftovers of a floating point template - confirm
# Create DP 2.0
add rbx, rax
# Create DP 0.5
div rax
movq rcx, rax
vmovq rax, xmm0
# Measured loop: NINST copies of INSTR per iteration
loop:
inc i
INSTR eax, ebx
INSTR ebx, eax
INSTR eax, ebx
INSTR ebx, eax
INSTR eax, ebx
INSTR ebx, eax
INSTR eax, ebx
INSTR ebx, eax
INSTR eax, ebx
INSTR ebx, eax
INSTR eax, ebx
INSTR ebx, eax
INSTR eax, ebx
INSTR ebx, eax
INSTR eax, ebx
INSTR ebx, eax
INSTR eax, ebx
INSTR ebx, eax
INSTR eax, ebx
INSTR ebx, eax
INSTR eax, ebx
INSTR ebx, eax
INSTR eax, ebx
INSTR ebx, eax
cmp i, N
jl loop
# Restore the saved registers in reverse order
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/cmp-rr-TP.S Normal file
View File

@@ -0,0 +1,100 @@
# Micro-benchmark kernel for the instruction selected by the INSTR macro (cmp).
# Exports ninst (= NINST) and the function latency, which runs the loop body
# N times (N is edi, presumably the first integer argument) counting in i (r8d).
# File name suffix -TP: presumably the throughput variant; the INSTR copies
# rotate over several registers - TODO confirm.
#define INSTR cmp
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
# Nothing to measure when N <= 0
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# Save the registers that are overwritten below
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the working registers
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# copy DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# NOTE(review): add and div below are integer operations on the bit pattern of 1.0,
# so the DP 2.0 / DP 0.5 labels may be leftovers of a floating point template - confirm
# Create DP 2.0
add rbx, rax
# Create DP 0.5
div rax
movq rcx, rax
vmovq rax, xmm0
# Measured loop: NINST copies of INSTR per iteration
loop:
inc i
INSTR rdx, rax
INSTR r9, rbx
INSTR r10, rcx
INSTR rdx, rax
INSTR r9, rbx
INSTR r10, rcx
INSTR r11, rax
INSTR r12, rbx
INSTR r13, rcx
INSTR r14, rax
INSTR r15, rbx
INSTR rax, rcx
INSTR rbx, rax
INSTR rcx, rbx
INSTR rdx, rcx
INSTR r9, rax
INSTR r10, rbx
INSTR r11, rcx
INSTR r12, rax
INSTR r13, rbx
INSTR r14, rcx
INSTR r15, rax
INSTR rax, rbx
INSTR rbx, rcx
cmp i, N
jl loop
# Restore the saved registers in reverse order
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/cmp-rr.S Normal file
View File

@@ -0,0 +1,100 @@
# Micro-benchmark kernel for the instruction selected by the INSTR macro (cmp).
# Exports ninst (= NINST) and the function latency, which runs the loop body
# N times (N is edi, presumably the first integer argument) counting in i (r8d).
# No -TP suffix in the file name: presumably the latency variant; the INSTR
# copies alternate between (rax, rbx) and (rbx, rax) - TODO confirm.
#define INSTR cmp
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
# Nothing to measure when N <= 0
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# Save the registers that are overwritten below
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the working registers
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# copy DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# NOTE(review): add and div below are integer operations on the bit pattern of 1.0,
# so the DP 2.0 / DP 0.5 labels may be leftovers of a floating point template - confirm
# Create DP 2.0
add rbx, rax
# Create DP 0.5
div rax
movq rcx, rax
vmovq rax, xmm0
# Measured loop: NINST copies of INSTR per iteration
loop:
inc i
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
cmp i, N
jl loop
# Restore the saved registers in reverse order
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/dec-r-TP.S Normal file
View File

@@ -0,0 +1,100 @@
# Micro-benchmark kernel for the instruction selected by the INSTR macro (dec).
# Exports ninst (= NINST) and the function latency, which runs the loop body
# N times (N is edi, presumably the first integer argument) counting in i (r8d).
# File name suffix -TP: presumably the throughput variant; the INSTR copies
# rotate over several registers - TODO confirm.
#define INSTR dec
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
# Nothing to measure when N <= 0
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# Save the registers that are overwritten below
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the working registers
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# copy DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# NOTE(review): add and div below are integer operations on the bit pattern of 1.0,
# so the DP 2.0 / DP 0.5 labels may be leftovers of a floating point template - confirm
# Create DP 2.0
add rbx, rax
# Create DP 0.5
div rax
movq rcx, rax
vmovq rax, xmm0
# Measured loop: NINST copies of INSTR per iteration
loop:
inc i
INSTR edx
INSTR r9d
INSTR r10d
INSTR edx
INSTR r9d
INSTR r10d
INSTR r11d
INSTR r12d
INSTR r13d
INSTR r14d
INSTR r15d
INSTR eax
INSTR ebx
INSTR ecx
INSTR edx
INSTR r9d
INSTR r10d
INSTR r11d
INSTR r12d
INSTR r13d
INSTR r14d
INSTR r15d
INSTR eax
INSTR ebx
cmp i, N
jl loop
# Restore the saved registers in reverse order
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/dec-r.S Normal file
View File

@@ -0,0 +1,100 @@
# Micro-benchmark kernel for the instruction selected by the INSTR macro (dec).
# Exports ninst (= NINST) and the function latency, which runs the loop body
# N times (N is edi, presumably the first integer argument) counting in i (r8d).
# No -TP suffix in the file name: presumably the latency variant; every INSTR
# copy operates on eax - TODO confirm.
#define INSTR dec
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
# Nothing to measure when N <= 0
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# Save the registers that are overwritten below
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the working registers
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# copy DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# NOTE(review): add and div below are integer operations on the bit pattern of 1.0,
# so the DP 2.0 / DP 0.5 labels may be leftovers of a floating point template - confirm
# Create DP 2.0
add rbx, rax
# Create DP 0.5
div rax
movq rcx, rax
vmovq rax, xmm0
# Measured loop: NINST copies of INSTR per iteration
loop:
inc i
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
cmp i, N
jl loop
# Restore the saved registers in reverse order
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/inc-r-TP.S Normal file
View File

@@ -0,0 +1,100 @@
# Micro-benchmark kernel for the instruction selected by the INSTR macro (inc).
# Exports ninst (= NINST) and the function latency, which runs the loop body
# N times (N is edi, presumably the first integer argument) counting in i (r8d).
# File name suffix -TP: presumably the throughput variant; the INSTR copies
# rotate over several registers - TODO confirm.
#define INSTR inc
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
# Nothing to measure when N <= 0
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# Save the registers that are overwritten below
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the working registers
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# copy DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# NOTE(review): add and div below are integer operations on the bit pattern of 1.0,
# so the DP 2.0 / DP 0.5 labels may be leftovers of a floating point template - confirm
# Create DP 2.0
add rbx, rax
# Create DP 0.5
div rax
movq rcx, rax
vmovq rax, xmm0
# Measured loop: NINST copies of INSTR per iteration
loop:
inc i
INSTR rdx
INSTR r9
INSTR r10
INSTR rdx
INSTR r9
INSTR r10
INSTR r11
INSTR r12
INSTR r13
INSTR r14
INSTR r15
INSTR rax
INSTR rbx
INSTR rcx
INSTR rdx
INSTR r9
INSTR r10
INSTR r11
INSTR r12
INSTR r13
INSTR r14
INSTR r15
INSTR rax
INSTR rbx
cmp i, N
jl loop
# Restore the saved registers in reverse order
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/inc-r.S Normal file
View File

@@ -0,0 +1,100 @@
# Micro-benchmark kernel for the instruction selected by the INSTR macro (inc).
# Exports ninst (= NINST) and the function latency, which runs the loop body
# N times (N is edi, presumably the first integer argument) counting in i (r8d).
# No -TP suffix in the file name: presumably the latency variant; every INSTR
# copy operates on rax - TODO confirm.
#define INSTR inc
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
# Nothing to measure when N <= 0
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# Save the registers that are overwritten below
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the working registers
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# copy DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# NOTE(review): add and div below are integer operations on the bit pattern of 1.0,
# so the DP 2.0 / DP 0.5 labels may be leftovers of a floating point template - confirm
# Create DP 2.0
add rbx, rax
# Create DP 0.5
div rax
movq rcx, rax
vmovq rax, xmm0
# Measured loop: NINST copies of INSTR per iteration
loop:
inc i
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
cmp i, N
jl loop
# Restore the saved registers in reverse order
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,82 @@
# Micro-benchmark kernel for the instruction selected by the INSTR macro (janadd).
# Exports ninst (= NINST) and the function latency, which runs the loop body
# N times (N is edi, presumably the first integer argument) counting in i (r8d).
# The INSTR copies rotate over three destination and three source registers.
# NOTE(review): janadd does not look like an x86 mnemonic - confirm that this
# file is meant to assemble.
#define INSTR janadd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
# Nothing to measure when N <= 0
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# Save the registers that are overwritten below
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the working registers
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# copy DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# NOTE(review): add and div below are integer operations on the bit pattern of 1.0,
# so the DP 2.0 / DP 0.5 labels may be leftovers of a floating point template - confirm
# Create DP 2.0
add rbx, rax
# Create DP 0.5
div rax
movq rcx, rax
vmovq rax, xmm0
# Measured loop: NINST copies of INSTR per iteration
loop:
inc i
INSTR rdx, eax
INSTR r9, ebx
INSTR r10, ecx
INSTR rdx, eax
INSTR r9, ebx
INSTR r10, ecx
cmp i, N
jl loop
# Restore the saved registers in reverse order
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

82
testcases/janadd-r64r32.S Normal file
View File

@@ -0,0 +1,82 @@
# Micro-benchmark kernel for the instruction selected by the INSTR macro (janadd).
# Exports ninst (= NINST) and the function latency, which runs the loop body
# N times (N is edi, presumably the first integer argument) counting in i (r8d).
# No -TP suffix in the file name: presumably the latency variant; every INSTR
# copy uses the operands rax, eax - TODO confirm.
# NOTE(review): janadd does not look like an x86 mnemonic - confirm that this
# file is meant to assemble.
#define INSTR janadd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
# Nothing to measure when N <= 0
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# Save the registers that are overwritten below
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the working registers
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# copy DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# NOTE(review): add and div below are integer operations on the bit pattern of 1.0,
# so the DP 2.0 / DP 0.5 labels may be leftovers of a floating point template - confirm
# Create DP 2.0
add rbx, rax
# Create DP 0.5
div rax
movq rcx, rax
vmovq rax, xmm0
# Measured loop: NINST copies of INSTR per iteration
loop:
inc i
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
cmp i, N
jl loop
# Restore the saved registers in reverse order
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

82
testcases/janadd-rr-TP.S Normal file
View File

@@ -0,0 +1,82 @@
# Micro-benchmark kernel for the instruction selected by the INSTR macro (janadd).
# Exports ninst (= NINST) and the function latency, which runs the loop body
# N times (N is edi, presumably the first integer argument) counting in i (r8d).
# File name suffix -TP: presumably the throughput variant; the INSTR copies
# rotate over several registers - TODO confirm.
# NOTE(review): janadd does not look like an x86 mnemonic - confirm that this
# file is meant to assemble.
#define INSTR janadd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
# Nothing to measure when N <= 0
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# Save the registers that are overwritten below
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the working registers
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# copy DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# NOTE(review): add and div below are integer operations on the bit pattern of 1.0,
# so the DP 2.0 / DP 0.5 labels may be leftovers of a floating point template - confirm
# Create DP 2.0
add rbx, rax
# Create DP 0.5
div rax
movq rcx, rax
vmovq rax, xmm0
# Measured loop: NINST copies of INSTR per iteration
loop:
inc i
INSTR rdx, eax
INSTR r9, ebx
INSTR r10, ecx
INSTR rdx, eax
INSTR r9, ebx
INSTR r10, ecx
cmp i, N
jl loop
# Restore the saved registers in reverse order
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

82
testcases/janadd-rr.S Normal file
View File

@@ -0,0 +1,82 @@
# Micro-benchmark kernel for the instruction selected by the INSTR macro (janadd).
# Exports ninst (= NINST) and the function latency, which runs the loop body
# N times (N is edi, presumably the first integer argument) counting in i (r8d).
# No -TP suffix in the file name: presumably the latency variant; every INSTR
# copy uses the operands rax, eax - TODO confirm.
# NOTE(review): janadd does not look like an x86 mnemonic - confirm that this
# file is meant to assemble.
#define INSTR janadd
#define NINST 6
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
# Nothing to measure when N <= 0
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# Save the registers that are overwritten below
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the working registers
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# copy DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# NOTE(review): add and div below are integer operations on the bit pattern of 1.0,
# so the DP 2.0 / DP 0.5 labels may be leftovers of a floating point template - confirm
# Create DP 2.0
add rbx, rax
# Create DP 0.5
div rax
movq rcx, rax
vmovq rax, xmm0
# Measured loop: NINST copies of INSTR per iteration
loop:
inc i
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
cmp i, N
jl loop
# Restore the saved registers in reverse order
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/mov-rr-TP.S Normal file
View File

@@ -0,0 +1,100 @@
# Micro-benchmark kernel for the instruction selected by the INSTR macro (mov).
# Exports ninst (= NINST) and the function latency, which runs the loop body
# N times (N is edi, presumably the first integer argument) counting in i (r8d).
# File name suffix -TP: presumably the throughput variant; the INSTR copies
# rotate over several registers - TODO confirm.
#define INSTR mov
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
# Nothing to measure when N <= 0
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# Save the registers that are overwritten below
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the working registers
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# copy DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# NOTE(review): add and div below are integer operations on the bit pattern of 1.0,
# so the DP 2.0 / DP 0.5 labels may be leftovers of a floating point template - confirm
# Create DP 2.0
add rbx, rax
# Create DP 0.5
div rax
movq rcx, rax
vmovq rax, xmm0
# Measured loop: NINST copies of INSTR per iteration
loop:
inc i
INSTR rdx, rax
INSTR r9, rbx
INSTR r10, rcx
INSTR rdx, rax
INSTR r9, rbx
INSTR r10, rcx
INSTR r11, rax
INSTR r12, rbx
INSTR r13, rcx
INSTR r14, rax
INSTR r15, rbx
INSTR rax, rcx
INSTR rbx, rax
INSTR rcx, rbx
INSTR rdx, rcx
INSTR r9, rax
INSTR r10, rbx
INSTR r11, rcx
INSTR r12, rax
INSTR r13, rbx
INSTR r14, rcx
INSTR r15, rax
INSTR rax, rbx
INSTR rbx, rcx
cmp i, N
jl loop
# Restore the saved registers in reverse order
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/mov-rr.S Normal file
View File

@@ -0,0 +1,100 @@
# Micro-benchmark kernel for the instruction selected by the INSTR macro (mov).
# Exports ninst (= NINST) and the function latency, which runs the loop body
# N times (N is edi, presumably the first integer argument) counting in i (r8d).
# No -TP suffix in the file name: presumably the latency variant; the INSTR
# copies alternate between (rax, rbx) and (rbx, rax) - TODO confirm.
#define INSTR mov
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
# Nothing to measure when N <= 0
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# Save the registers that are overwritten below
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the working registers
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# copy DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# NOTE(review): add and div below are integer operations on the bit pattern of 1.0,
# so the DP 2.0 / DP 0.5 labels may be leftovers of a floating point template - confirm
# Create DP 2.0
add rbx, rax
# Create DP 0.5
div rax
movq rcx, rax
vmovq rax, xmm0
# Measured loop: NINST copies of INSTR per iteration
loop:
inc i
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
cmp i, N
jl loop
# Restore the saved registers in reverse order
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/movslq-rr-TP.S Normal file
View File

@@ -0,0 +1,100 @@
# Micro-benchmark kernel for the instruction selected by the INSTR macro (movslq).
# Exports ninst (= NINST) and the function latency, which runs the loop body
# N times (N is edi, presumably the first integer argument) counting in i (r8d).
# File name suffix -TP: presumably the throughput variant; the INSTR copies
# rotate over several registers - TODO confirm.
#define INSTR movslq
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
# Nothing to measure when N <= 0
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
# Save the registers that are overwritten below
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the working registers
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# copy DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# NOTE(review): add and div below are integer operations on the bit pattern of 1.0,
# so the DP 2.0 / DP 0.5 labels may be leftovers of a floating point template - confirm
# Create DP 2.0
add rbx, rax
# Create DP 0.5
div rax
movq rcx, rax
vmovq rax, xmm0
# Measured loop: NINST copies of INSTR per iteration
loop:
inc i
INSTR rdx, eax
INSTR r9, ebx
INSTR r10, ecx
INSTR rdx, eax
INSTR r9, ebx
INSTR r10, ecx
INSTR r11, eax
INSTR r12, ebx
INSTR r13, ecx
INSTR r14, eax
INSTR r15, ebx
INSTR rax, ecx
INSTR rbx, eax
INSTR rcx, ebx
INSTR rdx, ecx
INSTR r9, eax
INSTR r10, ebx
INSTR r11, ecx
INSTR r12, eax
INSTR r13, ebx
INSTR r14, ecx
INSTR r15, eax
INSTR rax, ebx
INSTR rbx, ecx
cmp i, N
jl loop
# Restore the saved registers in reverse order
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/movslq-rr.S Normal file
View File

@@ -0,0 +1,100 @@
#define INSTR movslq
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# copy DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# Create DP 2.0
add rbx, rax
# Create DP 0.5
div rax
movq rcx, rax
vmovq rax, xmm0
loop:
inc i
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
INSTR rax, eax
cmp i, N
jl loop
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/movzbl-rr-TP.S Normal file
View File

@@ -0,0 +1,100 @@
# Benchmark kernel: latency() runs a loop containing NINST (24) copies of
# INSTR (movzbl). N (edi) is the iteration count and i (r8d) the loop
# counter. Destinations rotate over edx, r9d-r15d, eax, ebx and ecx so that
# no two consecutive copies write the same register; sources are al, bl, cl.
# The file name suffix TP presumably marks this as the throughput variant.
# NOTE(review): movzbl is the AT&T spelling of the Intel mnemonic movzx;
# confirm that the assembler accepts it under .intel_syntax.
#define INSTR movzbl
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry rsp so that done can restore it.
push rbp
mov rbp, rsp
# Zero the loop counter and leave early when N <= 0 (signed); that path
# jumps past the register restores because nothing has been pushed yet.
xor i, i
test N, N
jle done
# create DP 1.0 in both quadwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each quadword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: each quadword becomes 0x3FF0000000000000 (sign 0, exponent 0x3FF, mantissa 0)
# Save the registers that are cleared and used as operands below.
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the operand registers.
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# rax = rbx = bit pattern of DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# Integer add of the two bit patterns: rbx = 0x7FE0000000000000.
# NOTE(review): this is not the encoding of DP 2.0 (0x4000000000000000).
add rbx, rax
# Unsigned divide of rdx:rax by rax. rdx was cleared above, so the
# quotient in rax is 1 and the remainder in rdx is 0 (not DP 0.5).
div rax
movq rcx, rax # rcx = 1
vmovq rax, xmm0 # reload the DP 1.0 bit pattern into rax
loop:
inc i
INSTR edx, al
INSTR r9d, bl
INSTR r10d, cl
INSTR edx, al
INSTR r9d, bl
INSTR r10d, cl
INSTR r11d, al
INSTR r12d, bl
INSTR r13d, cl
INSTR r14d, al
INSTR r15d, bl
INSTR eax, cl
INSTR ebx, al
INSTR ecx, bl
INSTR edx, cl
INSTR r9d, al
INSTR r10d, bl
INSTR r11d, cl
INSTR r12d, al
INSTR r13d, bl
INSTR r14d, cl
INSTR r15d, al
INSTR eax, bl
INSTR ebx, cl
cmp i, N
jl loop # signed compare: repeat while i < N
# Restore the saved registers in reverse push order.
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
# Drop the frame and return.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/movzbl-rr.S Normal file
View File

@@ -0,0 +1,100 @@
#define INSTR movzbl
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# copy DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# Create DP 2.0
add rbx, rax
# Create DP 0.5
div rax
movq rcx, rax
vmovq rax, xmm0
loop:
inc i
INSTR eax, al
INSTR eax, al
INSTR eax, al
INSTR eax, al
INSTR eax, al
INSTR eax, al
INSTR eax, al
INSTR eax, al
INSTR eax, al
INSTR eax, al
INSTR eax, al
INSTR eax, al
INSTR eax, al
INSTR eax, al
INSTR eax, al
INSTR eax, al
INSTR eax, al
INSTR eax, al
INSTR eax, al
INSTR eax, al
INSTR eax, al
INSTR eax, al
INSTR eax, al
INSTR eax, al
cmp i, N
jl loop
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/neg-r-TP.S Normal file
View File

@@ -0,0 +1,100 @@
# Benchmark kernel: latency() runs a loop containing NINST (24) copies of
# INSTR (neg). N (edi) is the iteration count and i (r8d) the loop counter.
# The operand rotates over edx, r9d-r15d, eax, ebx and ecx so that no two
# consecutive copies touch the same register.
# The file name suffix TP presumably marks this as the throughput variant.
#define INSTR neg
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry rsp so that done can restore it.
push rbp
mov rbp, rsp
# Zero the loop counter and leave early when N <= 0 (signed); that path
# jumps past the register restores because nothing has been pushed yet.
xor i, i
test N, N
jle done
# create DP 1.0 in both quadwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each quadword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: each quadword becomes 0x3FF0000000000000 (sign 0, exponent 0x3FF, mantissa 0)
# Save the registers that are cleared and used as operands below.
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the operand registers.
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# rax = rbx = bit pattern of DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# Integer add of the two bit patterns: rbx = 0x7FE0000000000000.
# NOTE(review): this is not the encoding of DP 2.0 (0x4000000000000000).
add rbx, rax
# Unsigned divide of rdx:rax by rax. rdx was cleared above, so the
# quotient in rax is 1 and the remainder in rdx is 0 (not DP 0.5).
div rax
movq rcx, rax # rcx = 1
vmovq rax, xmm0 # reload the DP 1.0 bit pattern into rax
loop:
inc i
INSTR edx
INSTR r9d
INSTR r10d
INSTR edx
INSTR r9d
INSTR r10d
INSTR r11d
INSTR r12d
INSTR r13d
INSTR r14d
INSTR r15d
INSTR eax
INSTR ebx
INSTR ecx
INSTR edx
INSTR r9d
INSTR r10d
INSTR r11d
INSTR r12d
INSTR r13d
INSTR r14d
INSTR r15d
INSTR eax
INSTR ebx
cmp i, N
jl loop # signed compare: repeat while i < N
# Restore the saved registers in reverse push order.
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
# Drop the frame and return.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/neg-r.S Normal file
View File

@@ -0,0 +1,100 @@
# Benchmark kernel: latency() runs a loop containing NINST (24) copies of
# INSTR (neg). N (edi) is the iteration count and i (r8d) the loop counter.
# Every copy negates eax in place, so each one depends on the result of
# the previous one (a single dependency chain).
#define INSTR neg
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry rsp so that done can restore it.
push rbp
mov rbp, rsp
# Zero the loop counter and leave early when N <= 0 (signed); that path
# jumps past the register restores because nothing has been pushed yet.
xor i, i
test N, N
jle done
# create DP 1.0 in both quadwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each quadword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: each quadword becomes 0x3FF0000000000000 (sign 0, exponent 0x3FF, mantissa 0)
# Save the registers that are cleared and used as operands below.
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the operand registers.
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# rax = rbx = bit pattern of DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# Integer add of the two bit patterns: rbx = 0x7FE0000000000000.
# NOTE(review): this is not the encoding of DP 2.0 (0x4000000000000000).
add rbx, rax
# Unsigned divide of rdx:rax by rax. rdx was cleared above, so the
# quotient in rax is 1 and the remainder in rdx is 0 (not DP 0.5).
div rax
movq rcx, rax # rcx = 1
vmovq rax, xmm0 # reload the DP 1.0 bit pattern into rax
loop:
inc i
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
INSTR eax
cmp i, N
jl loop # signed compare: repeat while i < N
# Restore the saved registers in reverse push order.
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
# Drop the frame and return.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/pop-r-TP.S Normal file
View File

@@ -0,0 +1,100 @@
# Benchmark kernel: latency() runs a loop containing NINST (24) copies of
# INSTR (pop). N (edi) is the iteration count and i (r8d) the loop counter.
# The destination rotates over rdx, r9-r15, rax, rbx and rcx so that no
# two consecutive copies write the same register.
# The file name suffix TP presumably marks this as the throughput variant.
# NOTE(review): the loop body itself pops 24 quadwords per iteration, so
# rsp no longer points at the registers pushed in the prologue when the
# restoring pops after the loop run; those registers (rbx and r12-r15
# among them) are then reloaded from unrelated stack slots. The loop also
# reads ever higher stack addresses as N grows. Confirm, and consider
# resetting rsp (for example relative to rbp) before restoring.
#define INSTR pop
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry rsp so that done can restore it.
push rbp
mov rbp, rsp
# Zero the loop counter and leave early when N <= 0 (signed); that path
# jumps past the register restores because nothing has been pushed yet.
xor i, i
test N, N
jle done
# create DP 1.0 in both quadwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each quadword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: each quadword becomes 0x3FF0000000000000 (sign 0, exponent 0x3FF, mantissa 0)
# Save the registers that are cleared and used as operands below.
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the operand registers.
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# rax = rbx = bit pattern of DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# Integer add of the two bit patterns: rbx = 0x7FE0000000000000.
# NOTE(review): this is not the encoding of DP 2.0 (0x4000000000000000).
add rbx, rax
# Unsigned divide of rdx:rax by rax. rdx was cleared above, so the
# quotient in rax is 1 and the remainder in rdx is 0 (not DP 0.5).
div rax
movq rcx, rax # rcx = 1
vmovq rax, xmm0 # reload the DP 1.0 bit pattern into rax
loop:
inc i
INSTR rdx
INSTR r9
INSTR r10
INSTR rdx
INSTR r9
INSTR r10
INSTR r11
INSTR r12
INSTR r13
INSTR r14
INSTR r15
INSTR rax
INSTR rbx
INSTR rcx
INSTR rdx
INSTR r9
INSTR r10
INSTR r11
INSTR r12
INSTR r13
INSTR r14
INSTR r15
INSTR rax
INSTR rbx
cmp i, N
jl loop # signed compare: repeat while i < N
# Pops intended to restore the saved registers in reverse push order
# (rsp has been moved by the loop, see the review note at the top).
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
# Drop the frame and return; rsp is reloaded from rbp here.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/pop-r.S Normal file
View File

@@ -0,0 +1,100 @@
# Benchmark kernel: latency() runs a loop containing NINST (24) copies of
# INSTR (pop). N (edi) is the iteration count and i (r8d) the loop counter.
# Every copy pops into rax, so consecutive copies are linked through rsp
# and through the repeated write to rax.
# NOTE(review): the loop body itself pops 24 quadwords per iteration, so
# rsp no longer points at the registers pushed in the prologue when the
# restoring pops after the loop run; those registers (rbx and r12-r15
# among them) are then reloaded from unrelated stack slots. The loop also
# reads ever higher stack addresses as N grows. Confirm, and consider
# resetting rsp (for example relative to rbp) before restoring.
#define INSTR pop
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry rsp so that done can restore it.
push rbp
mov rbp, rsp
# Zero the loop counter and leave early when N <= 0 (signed); that path
# jumps past the register restores because nothing has been pushed yet.
xor i, i
test N, N
jle done
# create DP 1.0 in both quadwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each quadword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: each quadword becomes 0x3FF0000000000000 (sign 0, exponent 0x3FF, mantissa 0)
# Save the registers that are cleared and used as operands below.
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the operand registers.
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# rax = rbx = bit pattern of DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# Integer add of the two bit patterns: rbx = 0x7FE0000000000000.
# NOTE(review): this is not the encoding of DP 2.0 (0x4000000000000000).
add rbx, rax
# Unsigned divide of rdx:rax by rax. rdx was cleared above, so the
# quotient in rax is 1 and the remainder in rdx is 0 (not DP 0.5).
div rax
movq rcx, rax # rcx = 1
vmovq rax, xmm0 # reload the DP 1.0 bit pattern into rax
loop:
inc i
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
INSTR rax
cmp i, N
jl loop # signed compare: repeat while i < N
# Pops intended to restore the saved registers in reverse push order
# (rsp has been moved by the loop, see the review note at the top).
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
# Drop the frame and return; rsp is reloaded from rbp here.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/sub-rr-TP.S Normal file
View File

@@ -0,0 +1,100 @@
# Benchmark kernel: latency() runs a loop containing NINST (24) copies of
# INSTR (sub). N (edi) is the iteration count and i (r8d) the loop counter.
# The destination rotates over rdx, r9-r15, rax, rbx and rcx so that no
# two consecutive copies write the same register; sources are rax, rbx, rcx.
# The file name suffix TP presumably marks this as the throughput variant.
#define INSTR sub
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry rsp so that done can restore it.
push rbp
mov rbp, rsp
# Zero the loop counter and leave early when N <= 0 (signed); that path
# jumps past the register restores because nothing has been pushed yet.
xor i, i
test N, N
jle done
# create DP 1.0 in both quadwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each quadword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: each quadword becomes 0x3FF0000000000000 (sign 0, exponent 0x3FF, mantissa 0)
# Save the registers that are cleared and used as operands below.
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the operand registers.
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# rax = rbx = bit pattern of DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# Integer add of the two bit patterns: rbx = 0x7FE0000000000000.
# NOTE(review): this is not the encoding of DP 2.0 (0x4000000000000000).
add rbx, rax
# Unsigned divide of rdx:rax by rax. rdx was cleared above, so the
# quotient in rax is 1 and the remainder in rdx is 0 (not DP 0.5).
div rax
movq rcx, rax # rcx = 1
vmovq rax, xmm0 # reload the DP 1.0 bit pattern into rax
loop:
inc i
INSTR rdx, rax
INSTR r9, rbx
INSTR r10, rcx
INSTR rdx, rax
INSTR r9, rbx
INSTR r10, rcx
INSTR r11, rax
INSTR r12, rbx
INSTR r13, rcx
INSTR r14, rax
INSTR r15, rbx
INSTR rax, rcx
INSTR rbx, rax
INSTR rcx, rbx
INSTR rdx, rcx
INSTR r9, rax
INSTR r10, rbx
INSTR r11, rcx
INSTR r12, rax
INSTR r13, rbx
INSTR r14, rcx
INSTR r15, rax
INSTR rax, rbx
INSTR rbx, rcx
cmp i, N
jl loop # signed compare: repeat while i < N
# Restore the saved registers in reverse push order.
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
# Drop the frame and return.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/sub-rr.S Normal file
View File

@@ -0,0 +1,100 @@
# Benchmark kernel: latency() runs a loop containing NINST (24) copies of
# INSTR (sub). N (edi) is the iteration count and i (r8d) the loop counter.
# The copies alternate between rax -= rbx and rbx -= rax, so each one
# reads the result of the previous one (a single dependency chain).
#define INSTR sub
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry rsp so that done can restore it.
push rbp
mov rbp, rsp
# Zero the loop counter and leave early when N <= 0 (signed); that path
# jumps past the register restores because nothing has been pushed yet.
xor i, i
test N, N
jle done
# create DP 1.0 in both quadwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each quadword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: each quadword becomes 0x3FF0000000000000 (sign 0, exponent 0x3FF, mantissa 0)
# Save the registers that are cleared and used as operands below.
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the operand registers.
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# rax = rbx = bit pattern of DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# Integer add of the two bit patterns: rbx = 0x7FE0000000000000.
# NOTE(review): this is not the encoding of DP 2.0 (0x4000000000000000).
add rbx, rax
# Unsigned divide of rdx:rax by rax. rdx was cleared above, so the
# quotient in rax is 1 and the remainder in rdx is 0 (not DP 0.5).
div rax
movq rcx, rax # rcx = 1
vmovq rax, xmm0 # reload the DP 1.0 bit pattern into rax
loop:
inc i
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
cmp i, N
jl loop # signed compare: repeat while i < N
# Restore the saved registers in reverse push order.
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
# Drop the frame and return.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/test-rr-TP.S Normal file
View File

@@ -0,0 +1,100 @@
# Benchmark kernel: latency() runs a loop containing NINST (24) copies of
# INSTR (test). N (edi) is the iteration count and i (r8d) the loop
# counter. The first operand rotates over rdx, r9-r15, rax, rbx and rcx;
# the second operand is rax, rbx or rcx. test only updates the flags, so
# no general purpose register is written inside the loop.
# The file name suffix TP presumably marks this as the throughput variant.
#define INSTR test
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry rsp so that done can restore it.
push rbp
mov rbp, rsp
# Zero the loop counter and leave early when N <= 0 (signed); that path
# jumps past the register restores because nothing has been pushed yet.
xor i, i
test N, N
jle done
# create DP 1.0 in both quadwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each quadword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: each quadword becomes 0x3FF0000000000000 (sign 0, exponent 0x3FF, mantissa 0)
# Save the registers that are cleared and used as operands below.
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the operand registers.
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# rax = rbx = bit pattern of DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# Integer add of the two bit patterns: rbx = 0x7FE0000000000000.
# NOTE(review): this is not the encoding of DP 2.0 (0x4000000000000000).
add rbx, rax
# Unsigned divide of rdx:rax by rax. rdx was cleared above, so the
# quotient in rax is 1 and the remainder in rdx is 0 (not DP 0.5).
div rax
movq rcx, rax # rcx = 1
vmovq rax, xmm0 # reload the DP 1.0 bit pattern into rax
loop:
inc i
INSTR rdx, rax
INSTR r9, rbx
INSTR r10, rcx
INSTR rdx, rax
INSTR r9, rbx
INSTR r10, rcx
INSTR r11, rax
INSTR r12, rbx
INSTR r13, rcx
INSTR r14, rax
INSTR r15, rbx
INSTR rax, rcx
INSTR rbx, rax
INSTR rcx, rbx
INSTR rdx, rcx
INSTR r9, rax
INSTR r10, rbx
INSTR r11, rcx
INSTR r12, rax
INSTR r13, rbx
INSTR r14, rcx
INSTR r15, rax
INSTR rax, rbx
INSTR rbx, rcx
cmp i, N
jl loop # signed compare: repeat while i < N
# Restore the saved registers in reverse push order.
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
# Drop the frame and return.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/test-rr.S Normal file
View File

@@ -0,0 +1,100 @@
# Benchmark kernel: latency() runs a loop containing NINST (24) copies of
# INSTR (test). N (edi) is the iteration count and i (r8d) the loop
# counter. The copies alternate between the operand orders rax, rbx and
# rbx, rax.
# NOTE(review): test only updates the flags and never writes rax or rbx,
# so consecutive copies are not linked through a register result; confirm
# that this kernel measures what is intended.
#define INSTR test
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry rsp so that done can restore it.
push rbp
mov rbp, rsp
# Zero the loop counter and leave early when N <= 0 (signed); that path
# jumps past the register restores because nothing has been pushed yet.
xor i, i
test N, N
jle done
# create DP 1.0 in both quadwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each quadword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: each quadword becomes 0x3FF0000000000000 (sign 0, exponent 0x3FF, mantissa 0)
# Save the registers that are cleared and used as operands below.
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the operand registers.
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# rax = rbx = bit pattern of DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# Integer add of the two bit patterns: rbx = 0x7FE0000000000000.
# NOTE(review): this is not the encoding of DP 2.0 (0x4000000000000000).
add rbx, rax
# Unsigned divide of rdx:rax by rax. rdx was cleared above, so the
# quotient in rax is 1 and the remainder in rdx is 0 (not DP 0.5).
div rax
movq rcx, rax # rcx = 1
vmovq rax, xmm0 # reload the DP 1.0 bit pattern into rax
loop:
inc i
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
INSTR rax, rbx
INSTR rbx, rax
cmp i, N
jl loop # signed compare: repeat while i < N
# Restore the saved registers in reverse push order.
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
# Drop the frame and return.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,67 @@
# Benchmark kernel: latency() runs a loop containing NINST (24) copies of
# INSTR (vaddpd) on 256-bit ymm operands. N (edi) is the iteration count
# and i (r8d) the loop counter. The sources ymm0-ymm2 are never written
# inside the loop, so no copy reads the result of another one.
# NOTE(review): ymm16-ymm23 are only available with AVX-512 (EVEX
# encoding); confirm the target CPU and assembler support them.
#define INSTR vaddpd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry rsp so that done can restore it.
push rbp
mov rbp, rsp
# Zero the loop counter and leave early when N <= 0 (signed compare).
xor i, i
test N, N
jle done
# create DP 1.0 in both quadwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each quadword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: each quadword becomes 0x3FF0000000000000 (sign 0, exponent 0x3FF, mantissa 0)
# expand from SSE to AVX: copy the low 128 bits of ymm0 into its high 128 bits
vinsertf128 ymm0, ymm0, xmm0, 0x1
# ymm1 = copy of DP 1.0 (the first move copies ymm0 onto itself)
vmovaps ymm0, ymm0
vmovaps ymm1, ymm0
# ymm1 = 1.0 + 1.0 = DP 2.0
vaddpd ymm1, ymm1, ymm1
# ymm2 = ymm0 / ymm1 = DP 0.5
vdivpd ymm2, ymm0, ymm1
loop:
inc i
INSTR ymm3, ymm0, ymm0
INSTR ymm4, ymm1, ymm1
INSTR ymm5, ymm2, ymm2
INSTR ymm3, ymm0, ymm0
INSTR ymm4, ymm1, ymm1
INSTR ymm5, ymm2, ymm2
INSTR ymm6, ymm0, ymm0
INSTR ymm7, ymm1, ymm1
INSTR ymm8, ymm2, ymm2
INSTR ymm9, ymm0, ymm0
INSTR ymm10, ymm1, ymm1
INSTR ymm11, ymm2, ymm2
INSTR ymm12, ymm0, ymm0
INSTR ymm13, ymm1, ymm1
INSTR ymm14, ymm2, ymm2
INSTR ymm15, ymm0, ymm0
INSTR ymm16, ymm1, ymm1
INSTR ymm17, ymm2, ymm2
INSTR ymm18, ymm0, ymm0
INSTR ymm19, ymm1, ymm1
INSTR ymm20, ymm2, ymm2
INSTR ymm21, ymm0, ymm0
INSTR ymm22, ymm1, ymm1
INSTR ymm23, ymm2, ymm2
cmp i, N
jl loop # signed compare: repeat while i < N
done:
# Drop the frame and return.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,67 @@
# Benchmark kernel: latency() runs a loop containing NINST (24) copies of
# INSTR (vaddpd) on 256-bit ymm operands. N (edi) is the iteration count
# and i (r8d) the loop counter. The copies alternate between writing ymm0
# and ymm1 and each one reads the register written by the previous copy,
# so the 24 instructions form one dependency chain.
#define INSTR vaddpd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry rsp so that done can restore it.
push rbp
mov rbp, rsp
# Zero the loop counter and leave early when N <= 0 (signed compare).
xor i, i
test N, N
jle done
# create DP 1.0 in both quadwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each quadword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: each quadword becomes 0x3FF0000000000000 (sign 0, exponent 0x3FF, mantissa 0)
# expand from SSE to AVX: copy the low 128 bits of ymm0 into its high 128 bits
vinsertf128 ymm0, ymm0, xmm0, 0x1
# ymm1 = copy of DP 1.0 (the first move copies ymm0 onto itself)
vmovaps ymm0, ymm0
vmovaps ymm1, ymm0
# ymm1 = 1.0 + 1.0 = DP 2.0
vaddpd ymm1, ymm1, ymm1
# ymm2 = ymm0 / ymm1 = DP 0.5 (ymm2 is not used by the loop below)
vdivpd ymm2, ymm0, ymm1
loop:
inc i
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
cmp i, N
jl loop # signed compare: repeat while i < N
done:
# Drop the frame and return.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,65 @@
# Benchmark kernel: latency() runs a loop containing NINST (24) copies of
# INSTR (vaddpd) on 128-bit xmm operands. N (edi) is the iteration count
# and i (r8d) the loop counter. The sources xmm0-xmm2 are never written
# inside the loop, so no copy reads the result of another one.
# NOTE(review): xmm16-xmm23 are only available with AVX-512 (EVEX
# encoding); confirm the target CPU and assembler support them.
#define INSTR vaddpd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry rsp so that done can restore it.
push rbp
mov rbp, rsp
# Zero the loop counter and leave early when N <= 0 (signed compare).
xor i, i
test N, N
jle done
# create DP 1.0 in both quadwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each quadword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: each quadword becomes 0x3FF0000000000000 (sign 0, exponent 0x3FF, mantissa 0)
# xmm1 = copy of DP 1.0 (the first move copies xmm0 onto itself)
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# xmm1 = 1.0 + 1.0 = DP 2.0
vaddpd xmm1, xmm1, xmm1
# xmm2 = xmm0 / xmm1 = DP 0.5
vdivpd xmm2, xmm0, xmm1
loop:
inc i
INSTR xmm3, xmm0, xmm0
INSTR xmm4, xmm1, xmm1
INSTR xmm5, xmm2, xmm2
INSTR xmm3, xmm0, xmm0
INSTR xmm4, xmm1, xmm1
INSTR xmm5, xmm2, xmm2
INSTR xmm6, xmm0, xmm0
INSTR xmm7, xmm1, xmm1
INSTR xmm8, xmm2, xmm2
INSTR xmm9, xmm0, xmm0
INSTR xmm10, xmm1, xmm1
INSTR xmm11, xmm2, xmm2
INSTR xmm12, xmm0, xmm0
INSTR xmm13, xmm1, xmm1
INSTR xmm14, xmm2, xmm2
INSTR xmm15, xmm0, xmm0
INSTR xmm16, xmm1, xmm1
INSTR xmm17, xmm2, xmm2
INSTR xmm18, xmm0, xmm0
INSTR xmm19, xmm1, xmm1
INSTR xmm20, xmm2, xmm2
INSTR xmm21, xmm0, xmm0
INSTR xmm22, xmm1, xmm1
INSTR xmm23, xmm2, xmm2
cmp i, N
jl loop # signed compare: repeat while i < N
done:
# Drop the frame and return.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,65 @@
# Benchmark kernel: latency() runs a loop containing NINST (24) copies of
# INSTR (vaddpd) on 128-bit xmm operands. N (edi) is the iteration count
# and i (r8d) the loop counter. The copies alternate between writing xmm0
# and xmm1 and each one reads the register written by the previous copy,
# so the 24 instructions form one dependency chain.
#define INSTR vaddpd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry rsp so that done can restore it.
push rbp
mov rbp, rsp
# Zero the loop counter and leave early when N <= 0 (signed compare).
xor i, i
test N, N
jle done
# create DP 1.0 in both quadwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each quadword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: each quadword becomes 0x3FF0000000000000 (sign 0, exponent 0x3FF, mantissa 0)
# xmm1 = copy of DP 1.0 (the first move copies xmm0 onto itself)
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# xmm1 = 1.0 + 1.0 = DP 2.0
vaddpd xmm1, xmm1, xmm1
# xmm2 = xmm0 / xmm1 = DP 0.5 (xmm2 is not used by the loop below)
vdivpd xmm2, xmm0, xmm1
loop:
inc i
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
cmp i, N
jl loop # signed compare: repeat while i < N
done:
# Drop the frame and return.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,65 @@
# Benchmark kernel: latency() runs a loop containing NINST (24) copies of
# INSTR (vaddsd) on xmm operands. N (edi) is the iteration count and
# i (r8d) the loop counter. The sources xmm0-xmm2 are never written
# inside the loop, so no copy reads the result of another one.
# NOTE(review): xmm16-xmm23 are only available with AVX-512 (EVEX
# encoding); confirm the target CPU and assembler support them.
#define INSTR vaddsd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry rsp so that done can restore it.
push rbp
mov rbp, rsp
# Zero the loop counter and leave early when N <= 0 (signed compare).
xor i, i
test N, N
jle done
# create DP 1.0 in both quadwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each quadword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: each quadword becomes 0x3FF0000000000000 (sign 0, exponent 0x3FF, mantissa 0)
# xmm1 = copy of DP 1.0 (the first move copies xmm0 onto itself)
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# xmm1 = 1.0 + 1.0 = DP 2.0
vaddpd xmm1, xmm1, xmm1
# xmm2 = xmm0 / xmm1 = DP 0.5
vdivpd xmm2, xmm0, xmm1
loop:
inc i
INSTR xmm3, xmm0, xmm0
INSTR xmm4, xmm1, xmm1
INSTR xmm5, xmm2, xmm2
INSTR xmm3, xmm0, xmm0
INSTR xmm4, xmm1, xmm1
INSTR xmm5, xmm2, xmm2
INSTR xmm6, xmm0, xmm0
INSTR xmm7, xmm1, xmm1
INSTR xmm8, xmm2, xmm2
INSTR xmm9, xmm0, xmm0
INSTR xmm10, xmm1, xmm1
INSTR xmm11, xmm2, xmm2
INSTR xmm12, xmm0, xmm0
INSTR xmm13, xmm1, xmm1
INSTR xmm14, xmm2, xmm2
INSTR xmm15, xmm0, xmm0
INSTR xmm16, xmm1, xmm1
INSTR xmm17, xmm2, xmm2
INSTR xmm18, xmm0, xmm0
INSTR xmm19, xmm1, xmm1
INSTR xmm20, xmm2, xmm2
INSTR xmm21, xmm0, xmm0
INSTR xmm22, xmm1, xmm1
INSTR xmm23, xmm2, xmm2
cmp i, N
jl loop # signed compare: repeat while i < N
done:
# Drop the frame and return.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,65 @@
# Benchmark kernel: latency() runs a loop containing NINST (24) copies of
# INSTR (vaddsd) on xmm operands. N (edi) is the iteration count and
# i (r8d) the loop counter. The copies alternate between writing xmm0 and
# xmm1 and each one reads the register written by the previous copy, so
# the 24 instructions form one dependency chain.
#define INSTR vaddsd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry rsp so that done can restore it.
push rbp
mov rbp, rsp
# Zero the loop counter and leave early when N <= 0 (signed compare).
xor i, i
test N, N
jle done
# create DP 1.0 in both quadwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each quadword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: each quadword becomes 0x3FF0000000000000 (sign 0, exponent 0x3FF, mantissa 0)
# xmm1 = copy of DP 1.0 (the first move copies xmm0 onto itself)
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# xmm1 = 1.0 + 1.0 = DP 2.0
vaddpd xmm1, xmm1, xmm1
# xmm2 = xmm0 / xmm1 = DP 0.5 (xmm2 is not used by the loop below)
vdivpd xmm2, xmm0, xmm1
loop:
inc i
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
cmp i, N
jl loop # signed compare: repeat while i < N
done:
# Drop the frame and return.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,67 @@
# Benchmark kernel: latency() runs a loop containing NINST (24) copies of
# INSTR (vmovapd) on 256-bit ymm operands. N (edi) is the iteration count
# and i (r8d) the loop counter. The sources ymm0-ymm2 are never written
# inside the loop, so no copy reads the result of another one.
# NOTE(review): ymm16-ymm23 are only available with AVX-512 (EVEX
# encoding); confirm the target CPU and assembler support them.
#define INSTR vmovapd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry rsp so that done can restore it.
push rbp
mov rbp, rsp
# Zero the loop counter and leave early when N <= 0 (signed compare).
xor i, i
test N, N
jle done
# create DP 1.0 in both quadwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each quadword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: each quadword becomes 0x3FF0000000000000 (sign 0, exponent 0x3FF, mantissa 0)
# expand from SSE to AVX: copy the low 128 bits of ymm0 into its high 128 bits
vinsertf128 ymm0, ymm0, xmm0, 0x1
# ymm1 = copy of DP 1.0 (the first move copies ymm0 onto itself)
vmovaps ymm0, ymm0
vmovaps ymm1, ymm0
# ymm1 = 1.0 + 1.0 = DP 2.0
vaddpd ymm1, ymm1, ymm1
# ymm2 = ymm0 / ymm1 = DP 0.5
vdivpd ymm2, ymm0, ymm1
loop:
inc i
INSTR ymm3, ymm0
INSTR ymm4, ymm1
INSTR ymm5, ymm2
INSTR ymm3, ymm0
INSTR ymm4, ymm1
INSTR ymm5, ymm2
INSTR ymm6, ymm0
INSTR ymm7, ymm1
INSTR ymm8, ymm2
INSTR ymm9, ymm0
INSTR ymm10, ymm1
INSTR ymm11, ymm2
INSTR ymm12, ymm0
INSTR ymm13, ymm1
INSTR ymm14, ymm2
INSTR ymm15, ymm0
INSTR ymm16, ymm1
INSTR ymm17, ymm2
INSTR ymm18, ymm0
INSTR ymm19, ymm1
INSTR ymm20, ymm2
INSTR ymm21, ymm0
INSTR ymm22, ymm1
INSTR ymm23, ymm2
cmp i, N
jl loop # signed compare: repeat while i < N
done:
# Drop the frame and return.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,67 @@
# Benchmark kernel: latency() runs a loop containing NINST (24) copies of
# INSTR (vmovapd) on 256-bit ymm operands. N (edi) is the iteration count
# and i (r8d) the loop counter. The copies alternate between ymm0 <- ymm1
# and ymm1 <- ymm0, so each one reads the register written by the previous
# copy (a single dependency chain).
#define INSTR vmovapd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry rsp so that done can restore it.
push rbp
mov rbp, rsp
# Zero the loop counter and leave early when N <= 0 (signed compare).
xor i, i
test N, N
jle done
# create DP 1.0 in both quadwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each quadword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: each quadword becomes 0x3FF0000000000000 (sign 0, exponent 0x3FF, mantissa 0)
# expand from SSE to AVX: copy the low 128 bits of ymm0 into its high 128 bits
vinsertf128 ymm0, ymm0, xmm0, 0x1
# ymm1 = copy of DP 1.0 (the first move copies ymm0 onto itself)
vmovaps ymm0, ymm0
vmovaps ymm1, ymm0
# ymm1 = 1.0 + 1.0 = DP 2.0
vaddpd ymm1, ymm1, ymm1
# ymm2 = ymm0 / ymm1 = DP 0.5 (ymm2 is not used by the loop below)
vdivpd ymm2, ymm0, ymm1
loop:
inc i
INSTR ymm0, ymm1
INSTR ymm1, ymm0
INSTR ymm0, ymm1
INSTR ymm1, ymm0
INSTR ymm0, ymm1
INSTR ymm1, ymm0
INSTR ymm0, ymm1
INSTR ymm1, ymm0
INSTR ymm0, ymm1
INSTR ymm1, ymm0
INSTR ymm0, ymm1
INSTR ymm1, ymm0
INSTR ymm0, ymm1
INSTR ymm1, ymm0
INSTR ymm0, ymm1
INSTR ymm1, ymm0
INSTR ymm0, ymm1
INSTR ymm1, ymm0
INSTR ymm0, ymm1
INSTR ymm1, ymm0
INSTR ymm0, ymm1
INSTR ymm1, ymm0
INSTR ymm0, ymm1
INSTR ymm1, ymm0
cmp i, N
jl loop # signed compare: repeat while i < N
done:
# Drop the frame and return.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,65 @@
# Benchmark kernel: latency() runs a loop containing NINST (24) copies of
# INSTR (vmovapd) on 128-bit xmm operands. N (edi) is the iteration count
# and i (r8d) the loop counter. The sources xmm0-xmm2 are never written
# inside the loop, so no copy reads the result of another one.
# NOTE(review): xmm16-xmm23 are only available with AVX-512 (EVEX
# encoding); confirm the target CPU and assembler support them.
#define INSTR vmovapd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry rsp so that done can restore it.
push rbp
mov rbp, rsp
# Zero the loop counter and leave early when N <= 0 (signed compare).
xor i, i
test N, N
jle done
# create DP 1.0 in both quadwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each quadword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: each quadword becomes 0x3FF0000000000000 (sign 0, exponent 0x3FF, mantissa 0)
# xmm1 = copy of DP 1.0 (the first move copies xmm0 onto itself)
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# xmm1 = 1.0 + 1.0 = DP 2.0
vaddpd xmm1, xmm1, xmm1
# xmm2 = xmm0 / xmm1 = DP 0.5
vdivpd xmm2, xmm0, xmm1
loop:
inc i
INSTR xmm3, xmm0
INSTR xmm4, xmm1
INSTR xmm5, xmm2
INSTR xmm3, xmm0
INSTR xmm4, xmm1
INSTR xmm5, xmm2
INSTR xmm6, xmm0
INSTR xmm7, xmm1
INSTR xmm8, xmm2
INSTR xmm9, xmm0
INSTR xmm10, xmm1
INSTR xmm11, xmm2
INSTR xmm12, xmm0
INSTR xmm13, xmm1
INSTR xmm14, xmm2
INSTR xmm15, xmm0
INSTR xmm16, xmm1
INSTR xmm17, xmm2
INSTR xmm18, xmm0
INSTR xmm19, xmm1
INSTR xmm20, xmm2
INSTR xmm21, xmm0
INSTR xmm22, xmm1
INSTR xmm23, xmm2
cmp i, N
jl loop # signed compare: repeat while i < N
done:
# Drop the frame and return.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,65 @@
# Benchmark kernel: latency() runs a loop containing NINST (24) copies of
# INSTR (vmovapd) on 128-bit xmm operands. N (edi) is the iteration count
# and i (r8d) the loop counter. The copies alternate between xmm0 <- xmm1
# and xmm1 <- xmm0, so each one reads the register written by the previous
# copy (a single dependency chain).
#define INSTR vmovapd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry rsp so that done can restore it.
push rbp
mov rbp, rsp
# Zero the loop counter and leave early when N <= 0 (signed compare).
xor i, i
test N, N
jle done
# create DP 1.0 in both quadwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each quadword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: each quadword becomes 0x3FF0000000000000 (sign 0, exponent 0x3FF, mantissa 0)
# xmm1 = copy of DP 1.0 (the first move copies xmm0 onto itself)
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# xmm1 = 1.0 + 1.0 = DP 2.0
vaddpd xmm1, xmm1, xmm1
# xmm2 = xmm0 / xmm1 = DP 0.5 (xmm2 is not used by the loop below)
vdivpd xmm2, xmm0, xmm1
loop:
inc i
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
cmp i, N
jl loop # signed compare: repeat while i < N
done:
# Drop the frame and return.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,65 @@
# Benchmark kernel: latency() runs a loop containing NINST (24) copies of
# INSTR (vmovaps) on 128-bit xmm operands. N (edi) is the iteration count
# and i (r8d) the loop counter. The sources xmm0-xmm2 are never written
# inside the loop, so no copy reads the result of another one.
# NOTE(review): xmm16-xmm23 are only available with AVX-512 (EVEX
# encoding); confirm the target CPU and assembler support them.
#define INSTR vmovaps
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry rsp so that done can restore it.
push rbp
mov rbp, rsp
# Zero the loop counter and leave early when N <= 0 (signed compare).
xor i, i
test N, N
jle done
# create DP 1.0 in both quadwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each quadword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: each quadword becomes 0x3FF0000000000000 (sign 0, exponent 0x3FF, mantissa 0)
# xmm1 = copy of DP 1.0 (the first move copies xmm0 onto itself)
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# xmm1 = 1.0 + 1.0 = DP 2.0
vaddpd xmm1, xmm1, xmm1
# xmm2 = xmm0 / xmm1 = DP 0.5
vdivpd xmm2, xmm0, xmm1
loop:
inc i
INSTR xmm3, xmm0
INSTR xmm4, xmm1
INSTR xmm5, xmm2
INSTR xmm3, xmm0
INSTR xmm4, xmm1
INSTR xmm5, xmm2
INSTR xmm6, xmm0
INSTR xmm7, xmm1
INSTR xmm8, xmm2
INSTR xmm9, xmm0
INSTR xmm10, xmm1
INSTR xmm11, xmm2
INSTR xmm12, xmm0
INSTR xmm13, xmm1
INSTR xmm14, xmm2
INSTR xmm15, xmm0
INSTR xmm16, xmm1
INSTR xmm17, xmm2
INSTR xmm18, xmm0
INSTR xmm19, xmm1
INSTR xmm20, xmm2
INSTR xmm21, xmm0
INSTR xmm22, xmm1
INSTR xmm23, xmm2
cmp i, N
jl loop # signed compare: repeat while i < N
done:
# Drop the frame and return.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,65 @@
# Benchmark kernel: latency() runs a loop containing NINST (24) copies of
# INSTR (vmovaps) on 128-bit xmm operands. N (edi) is the iteration count
# and i (r8d) the loop counter. The copies alternate between xmm0 <- xmm1
# and xmm1 <- xmm0, so each one reads the register written by the previous
# copy (a single dependency chain).
#define INSTR vmovaps
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry rsp so that done can restore it.
push rbp
mov rbp, rsp
# Zero the loop counter and leave early when N <= 0 (signed compare).
xor i, i
test N, N
jle done
# create DP 1.0 in both quadwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each quadword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: each quadword becomes 0x3FF0000000000000 (sign 0, exponent 0x3FF, mantissa 0)
# xmm1 = copy of DP 1.0 (the first move copies xmm0 onto itself)
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# xmm1 = 1.0 + 1.0 = DP 2.0
vaddpd xmm1, xmm1, xmm1
# xmm2 = xmm0 / xmm1 = DP 0.5 (xmm2 is not used by the loop below)
vdivpd xmm2, xmm0, xmm1
loop:
inc i
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
INSTR xmm0, xmm1
INSTR xmm1, xmm0
cmp i, N
jl loop # signed compare: repeat while i < N
done:
# Drop the frame and return.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

98
testcases/vmovq-rxmm-TP.S Normal file
View File

@@ -0,0 +1,98 @@
# Benchmark kernel: latency() runs a loop containing NINST (24) copies of
# INSTR (vmovq) moving from an xmm register into a general purpose
# register. N (edi) is the iteration count and i (r8d) the loop counter.
# Destinations rotate over rdx, r9-r15, rax, rbx and rcx; the sources
# xmm0-xmm2 are never written inside the loop, so no copy reads the result
# of another one.
# The file name suffix TP presumably marks this as the throughput variant.
#define INSTR vmovq
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry rsp so that done can restore it.
push rbp
mov rbp, rsp
# Zero the loop counter and leave early when N <= 0 (signed); that path
# jumps past the register restores because nothing has been pushed yet.
xor i, i
test N, N
jle done
# create DP 1.0 in both quadwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each quadword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: each quadword becomes 0x3FF0000000000000 (sign 0, exponent 0x3FF, mantissa 0)
# Save the registers that are cleared and used as destinations below.
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the destination registers.
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# xmm1 = copy of DP 1.0 (the first move copies xmm0 onto itself)
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# xmm1 = 1.0 + 1.0 = DP 2.0
vaddpd xmm1, xmm1, xmm1
# xmm2 = xmm0 / xmm1 = DP 0.5
vdivpd xmm2, xmm0, xmm1
loop:
inc i
INSTR rdx, xmm0
INSTR r9, xmm1
INSTR r10, xmm2
INSTR rdx, xmm0
INSTR r9, xmm1
INSTR r10, xmm2
INSTR r11, xmm0
INSTR r12, xmm1
INSTR r13, xmm2
INSTR r14, xmm0
INSTR r15, xmm1
INSTR rax, xmm2
INSTR rbx, xmm0
INSTR rcx, xmm1
INSTR rdx, xmm2
INSTR r9, xmm0
INSTR r10, xmm1
INSTR r11, xmm2
INSTR r12, xmm0
INSTR r13, xmm1
INSTR r14, xmm2
INSTR r15, xmm0
INSTR rax, xmm1
INSTR rbx, xmm2
cmp i, N
jl loop # signed compare: repeat while i < N
# Restore the saved registers in reverse push order.
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
# Drop the frame and return.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

98
testcases/vmovq-rxmm.S Normal file
View File

@@ -0,0 +1,98 @@
#define INSTR vmovq
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# copy DP 1.0
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# Create DP 2.0
vaddpd xmm1, xmm1, xmm1
# Create DP 0.5
vdivpd xmm2, xmm0, xmm1
loop:
inc i
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
INSTR rax, xmm0
cmp i, N
jl loop
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/vmovq-xmmr-TP.S Normal file
View File

@@ -0,0 +1,100 @@
# Benchmark kernel: latency() runs a loop containing NINST (24) copies of
# INSTR (vmovq) moving from a general purpose register into an xmm
# register. N (edi) is the iteration count and i (r8d) the loop counter.
# Destinations run over xmm3-xmm23; the sources rax, rbx and rcx are never
# written inside the loop, so no copy reads the result of another one.
# The file name suffix TP presumably marks this as the throughput variant.
# NOTE(review): xmm16-xmm23 are only available with AVX-512 (EVEX
# encoding); confirm the target CPU and assembler support them.
#define INSTR vmovq
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
# Frame setup: rbp keeps the entry rsp so that done can restore it.
push rbp
mov rbp, rsp
# Zero the loop counter and leave early when N <= 0 (signed); that path
# jumps past the register restores because nothing has been pushed yet.
xor i, i
test N, N
jle done
# create DP 1.0 in both quadwords of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift by 54 (= 64 - 10): only the top 10 bits of each quadword stay set
vpsrlq xmm0, xmm0, 2 # logical right shift by 2: each quadword becomes 0x3FF0000000000000 (sign 0, exponent 0x3FF, mantissa 0)
# Save the registers that are cleared and used as operands below.
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Clear the operand registers.
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# rax = rbx = bit pattern of DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# Integer add of the two bit patterns: rbx = 0x7FE0000000000000.
# NOTE(review): this is not the encoding of DP 2.0 (0x4000000000000000).
add rbx, rax
# Unsigned divide of rdx:rax by rax. rdx was cleared above, so the
# quotient in rax is 1 and the remainder in rdx is 0 (not DP 0.5).
div rax
movq rcx, rax # rcx = 1
vmovq rax, xmm0 # reload the DP 1.0 bit pattern into rax
loop:
inc i
INSTR xmm3, rax
INSTR xmm4, rbx
INSTR xmm5, rcx
INSTR xmm3, rax
INSTR xmm4, rbx
INSTR xmm5, rcx
INSTR xmm6, rax
INSTR xmm7, rbx
INSTR xmm8, rcx
INSTR xmm9, rax
INSTR xmm10, rbx
INSTR xmm11, rcx
INSTR xmm12, rax
INSTR xmm13, rbx
INSTR xmm14, rcx
INSTR xmm15, rax
INSTR xmm16, rbx
INSTR xmm17, rcx
INSTR xmm18, rax
INSTR xmm19, rbx
INSTR xmm20, rcx
INSTR xmm21, rax
INSTR xmm22, rbx
INSTR xmm23, rcx
cmp i, N
jl loop # signed compare: repeat while i < N
# Restore the saved registers in reverse push order.
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
# Drop the frame and return.
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/vmovq-xmmr.S Normal file
View File

@@ -0,0 +1,100 @@
#define INSTR vmovq
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
xor i, i
test N, N
jle done
# create DP 1.0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# copy DP 1.0
vmovq rax, xmm0
vmovq rbx, xmm0
# Create DP 2.0
add rbx, rax
# Create DP 0.5
div rax
movq rcx, rax
vmovq rax, xmm0
loop:
inc i
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
INSTR xmm0, rax
cmp i, N
jl loop
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,65 @@
# Benchmark kernel for the three-operand xmm form of vmovsd.
# Exports the global 32-bit value ninst (= NINST) and the global function latency.
# Input: edi (macro N) is the loop count; r8d (macro i) is the loop counter.
# The NINST copies of INSTR per iteration write xmm3..xmm23 and read only xmm0,
# xmm1 and xmm2, which the loop never modifies, so no copy reads the result of another.
# NOTE(review): xmm16..xmm23 need an EVEX (AVX-512) encoding - confirm the target CPU supports it.
#define INSTR vmovsd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
# Clear the loop counter; skip the whole kernel when the loop count is not positive
xor i, i
test N, N
jle done
# Create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (0xFFC0000000000000)
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero
# Copy DP 1.0 into xmm1 (the first move copies xmm0 onto itself)
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# Create DP 2.0 in xmm1 (1.0 + 1.0)
vaddpd xmm1, xmm1, xmm1
# Create DP 0.5 in xmm2 (1.0 / 2.0)
vdivpd xmm2, xmm0, xmm1
loop:
inc i
INSTR xmm3, xmm0, xmm0
INSTR xmm4, xmm1, xmm1
INSTR xmm5, xmm2, xmm2
INSTR xmm3, xmm0, xmm0
INSTR xmm4, xmm1, xmm1
INSTR xmm5, xmm2, xmm2
INSTR xmm6, xmm0, xmm0
INSTR xmm7, xmm1, xmm1
INSTR xmm8, xmm2, xmm2
INSTR xmm9, xmm0, xmm0
INSTR xmm10, xmm1, xmm1
INSTR xmm11, xmm2, xmm2
INSTR xmm12, xmm0, xmm0
INSTR xmm13, xmm1, xmm1
INSTR xmm14, xmm2, xmm2
INSTR xmm15, xmm0, xmm0
INSTR xmm16, xmm1, xmm1
INSTR xmm17, xmm2, xmm2
INSTR xmm18, xmm0, xmm0
INSTR xmm19, xmm1, xmm1
INSTR xmm20, xmm2, xmm2
INSTR xmm21, xmm0, xmm0
INSTR xmm22, xmm1, xmm1
INSTR xmm23, xmm2, xmm2
cmp i, N
jl loop
done:
# Common exit: restore the stack pointer and the saved rbp
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,65 @@
# Benchmark kernel for the three-operand xmm form of vmovsd, arranged as a dependency chain.
# Exports the global 32-bit value ninst (= NINST) and the global function latency.
# Input: edi (macro N) is the loop count; r8d (macro i) is the loop counter.
# The NINST copies of INSTR per iteration alternate between writing xmm0 and xmm1;
# every copy reads the register written by the copy before it.
#define INSTR vmovsd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
# Clear the loop counter; skip the whole kernel when the loop count is not positive
xor i, i
test N, N
jle done
# Create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (0xFFC0000000000000)
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero
# Copy DP 1.0 into xmm1 (the first move copies xmm0 onto itself)
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# Create DP 2.0 in xmm1 (1.0 + 1.0)
vaddpd xmm1, xmm1, xmm1
# Create DP 0.5 in xmm2 (1.0 / 2.0); xmm2 is not used by the loop below
vdivpd xmm2, xmm0, xmm1
loop:
inc i
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
cmp i, N
jl loop
done:
# Common exit: restore the stack pointer and the saved rbp
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,67 @@
# Benchmark kernel for the three-operand ymm form of vmulpd.
# Exports the global 32-bit value ninst (= NINST) and the global function latency.
# Input: edi (macro N) is the loop count; r8d (macro i) is the loop counter.
# The NINST copies of INSTR per iteration write ymm3..ymm23 and read only ymm0,
# ymm1 and ymm2, which the loop never modifies, so no copy reads the result of another.
# NOTE(review): ymm16..ymm23 need an EVEX (AVX-512) encoding - confirm the target CPU supports it.
#define INSTR vmulpd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
# Clear the loop counter; skip the whole kernel when the loop count is not positive
xor i, i
test N, N
jle done
# Create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (0xFFC0000000000000)
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero
# Expand from SSE to AVX: copy the low 128 bits of ymm0 into its high 128 bits
vinsertf128 ymm0, ymm0, xmm0, 0x1
# Copy DP 1.0 into ymm1 (the first move copies ymm0 onto itself)
vmovaps ymm0, ymm0
vmovaps ymm1, ymm0
# Create DP 2.0 in ymm1 (1.0 + 1.0)
vaddpd ymm1, ymm1, ymm1
# Create DP 0.5 in ymm2 (1.0 / 2.0)
vdivpd ymm2, ymm0, ymm1
loop:
inc i
INSTR ymm3, ymm0, ymm0
INSTR ymm4, ymm1, ymm1
INSTR ymm5, ymm2, ymm2
INSTR ymm3, ymm0, ymm0
INSTR ymm4, ymm1, ymm1
INSTR ymm5, ymm2, ymm2
INSTR ymm6, ymm0, ymm0
INSTR ymm7, ymm1, ymm1
INSTR ymm8, ymm2, ymm2
INSTR ymm9, ymm0, ymm0
INSTR ymm10, ymm1, ymm1
INSTR ymm11, ymm2, ymm2
INSTR ymm12, ymm0, ymm0
INSTR ymm13, ymm1, ymm1
INSTR ymm14, ymm2, ymm2
INSTR ymm15, ymm0, ymm0
INSTR ymm16, ymm1, ymm1
INSTR ymm17, ymm2, ymm2
INSTR ymm18, ymm0, ymm0
INSTR ymm19, ymm1, ymm1
INSTR ymm20, ymm2, ymm2
INSTR ymm21, ymm0, ymm0
INSTR ymm22, ymm1, ymm1
INSTR ymm23, ymm2, ymm2
cmp i, N
jl loop
done:
# Common exit: restore the stack pointer and the saved rbp
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,67 @@
# Benchmark kernel for the three-operand ymm form of vmulpd, arranged as a dependency chain.
# Exports the global 32-bit value ninst (= NINST) and the global function latency.
# Input: edi (macro N) is the loop count; r8d (macro i) is the loop counter.
# The NINST copies of INSTR per iteration alternate between writing ymm0 and ymm1;
# every copy reads the register written by the copy before it.
#define INSTR vmulpd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
# Clear the loop counter; skip the whole kernel when the loop count is not positive
xor i, i
test N, N
jle done
# Create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (0xFFC0000000000000)
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero
# Expand from SSE to AVX: copy the low 128 bits of ymm0 into its high 128 bits
vinsertf128 ymm0, ymm0, xmm0, 0x1
# Copy DP 1.0 into ymm1 (the first move copies ymm0 onto itself)
vmovaps ymm0, ymm0
vmovaps ymm1, ymm0
# Create DP 2.0 in ymm1 (1.0 + 1.0)
vaddpd ymm1, ymm1, ymm1
# Create DP 0.5 in ymm2 (1.0 / 2.0); ymm2 is not used by the loop below
vdivpd ymm2, ymm0, ymm1
loop:
inc i
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
cmp i, N
jl loop
done:
# Common exit: restore the stack pointer and the saved rbp
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,65 @@
# Benchmark kernel for the three-operand xmm form of vmulsd.
# Exports the global 32-bit value ninst (= NINST) and the global function latency.
# Input: edi (macro N) is the loop count; r8d (macro i) is the loop counter.
# The NINST copies of INSTR per iteration write xmm3..xmm23 and read only xmm0,
# xmm1 and xmm2, which the loop never modifies, so no copy reads the result of another.
# NOTE(review): xmm16..xmm23 need an EVEX (AVX-512) encoding - confirm the target CPU supports it.
#define INSTR vmulsd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
# Clear the loop counter; skip the whole kernel when the loop count is not positive
xor i, i
test N, N
jle done
# Create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (0xFFC0000000000000)
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero
# Copy DP 1.0 into xmm1 (the first move copies xmm0 onto itself)
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# Create DP 2.0 in xmm1 (1.0 + 1.0)
vaddpd xmm1, xmm1, xmm1
# Create DP 0.5 in xmm2 (1.0 / 2.0)
vdivpd xmm2, xmm0, xmm1
loop:
inc i
INSTR xmm3, xmm0, xmm0
INSTR xmm4, xmm1, xmm1
INSTR xmm5, xmm2, xmm2
INSTR xmm3, xmm0, xmm0
INSTR xmm4, xmm1, xmm1
INSTR xmm5, xmm2, xmm2
INSTR xmm6, xmm0, xmm0
INSTR xmm7, xmm1, xmm1
INSTR xmm8, xmm2, xmm2
INSTR xmm9, xmm0, xmm0
INSTR xmm10, xmm1, xmm1
INSTR xmm11, xmm2, xmm2
INSTR xmm12, xmm0, xmm0
INSTR xmm13, xmm1, xmm1
INSTR xmm14, xmm2, xmm2
INSTR xmm15, xmm0, xmm0
INSTR xmm16, xmm1, xmm1
INSTR xmm17, xmm2, xmm2
INSTR xmm18, xmm0, xmm0
INSTR xmm19, xmm1, xmm1
INSTR xmm20, xmm2, xmm2
INSTR xmm21, xmm0, xmm0
INSTR xmm22, xmm1, xmm1
INSTR xmm23, xmm2, xmm2
cmp i, N
jl loop
done:
# Common exit: restore the stack pointer and the saved rbp
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,65 @@
# Benchmark kernel for the three-operand xmm form of vmulsd, arranged as a dependency chain.
# Exports the global 32-bit value ninst (= NINST) and the global function latency.
# Input: edi (macro N) is the loop count; r8d (macro i) is the loop counter.
# The NINST copies of INSTR per iteration alternate between writing xmm0 and xmm1;
# every copy reads the register written by the copy before it.
#define INSTR vmulsd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
# Clear the loop counter; skip the whole kernel when the loop count is not positive
xor i, i
test N, N
jle done
# Create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (0xFFC0000000000000)
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero
# Copy DP 1.0 into xmm1 (the first move copies xmm0 onto itself)
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# Create DP 2.0 in xmm1 (1.0 + 1.0)
vaddpd xmm1, xmm1, xmm1
# Create DP 0.5 in xmm2 (1.0 / 2.0); xmm2 is not used by the loop below
vdivpd xmm2, xmm0, xmm1
loop:
inc i
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
cmp i, N
jl loop
done:
# Common exit: restore the stack pointer and the saved rbp
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,67 @@
# Benchmark kernel for the three-operand ymm form of vsubpd.
# Exports the global 32-bit value ninst (= NINST) and the global function latency.
# Input: edi (macro N) is the loop count; r8d (macro i) is the loop counter.
# The NINST copies of INSTR per iteration write ymm3..ymm23 and read only ymm0,
# ymm1 and ymm2, which the loop never modifies, so no copy reads the result of another.
# NOTE(review): ymm16..ymm23 need an EVEX (AVX-512) encoding - confirm the target CPU supports it.
#define INSTR vsubpd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
# Clear the loop counter; skip the whole kernel when the loop count is not positive
xor i, i
test N, N
jle done
# Create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (0xFFC0000000000000)
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero
# Expand from SSE to AVX: copy the low 128 bits of ymm0 into its high 128 bits
vinsertf128 ymm0, ymm0, xmm0, 0x1
# Copy DP 1.0 into ymm1 (the first move copies ymm0 onto itself)
vmovaps ymm0, ymm0
vmovaps ymm1, ymm0
# Create DP 2.0 in ymm1 (1.0 + 1.0)
vaddpd ymm1, ymm1, ymm1
# Create DP 0.5 in ymm2 (1.0 / 2.0)
vdivpd ymm2, ymm0, ymm1
loop:
inc i
INSTR ymm3, ymm0, ymm0
INSTR ymm4, ymm1, ymm1
INSTR ymm5, ymm2, ymm2
INSTR ymm3, ymm0, ymm0
INSTR ymm4, ymm1, ymm1
INSTR ymm5, ymm2, ymm2
INSTR ymm6, ymm0, ymm0
INSTR ymm7, ymm1, ymm1
INSTR ymm8, ymm2, ymm2
INSTR ymm9, ymm0, ymm0
INSTR ymm10, ymm1, ymm1
INSTR ymm11, ymm2, ymm2
INSTR ymm12, ymm0, ymm0
INSTR ymm13, ymm1, ymm1
INSTR ymm14, ymm2, ymm2
INSTR ymm15, ymm0, ymm0
INSTR ymm16, ymm1, ymm1
INSTR ymm17, ymm2, ymm2
INSTR ymm18, ymm0, ymm0
INSTR ymm19, ymm1, ymm1
INSTR ymm20, ymm2, ymm2
INSTR ymm21, ymm0, ymm0
INSTR ymm22, ymm1, ymm1
INSTR ymm23, ymm2, ymm2
cmp i, N
jl loop
done:
# Common exit: restore the stack pointer and the saved rbp
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,67 @@
# Benchmark kernel for the three-operand ymm form of vsubpd, arranged as a dependency chain.
# Exports the global 32-bit value ninst (= NINST) and the global function latency.
# Input: edi (macro N) is the loop count; r8d (macro i) is the loop counter.
# The NINST copies of INSTR per iteration alternate between writing ymm0 and ymm1;
# every copy reads the register written by the copy before it.
#define INSTR vsubpd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
# Clear the loop counter; skip the whole kernel when the loop count is not positive
xor i, i
test N, N
jle done
# Create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (0xFFC0000000000000)
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero
# Expand from SSE to AVX: copy the low 128 bits of ymm0 into its high 128 bits
vinsertf128 ymm0, ymm0, xmm0, 0x1
# Copy DP 1.0 into ymm1 (the first move copies ymm0 onto itself)
vmovaps ymm0, ymm0
vmovaps ymm1, ymm0
# Create DP 2.0 in ymm1 (1.0 + 1.0)
vaddpd ymm1, ymm1, ymm1
# Create DP 0.5 in ymm2 (1.0 / 2.0); ymm2 is not used by the loop below
vdivpd ymm2, ymm0, ymm1
loop:
inc i
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm0
cmp i, N
jl loop
done:
# Common exit: restore the stack pointer and the saved rbp
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,65 @@
# Benchmark kernel for the three-operand xmm form of vsubsd.
# Exports the global 32-bit value ninst (= NINST) and the global function latency.
# Input: edi (macro N) is the loop count; r8d (macro i) is the loop counter.
# The NINST copies of INSTR per iteration write xmm3..xmm23 and read only xmm0,
# xmm1 and xmm2, which the loop never modifies, so no copy reads the result of another.
# NOTE(review): xmm16..xmm23 need an EVEX (AVX-512) encoding - confirm the target CPU supports it.
#define INSTR vsubsd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
# Clear the loop counter; skip the whole kernel when the loop count is not positive
xor i, i
test N, N
jle done
# Create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (0xFFC0000000000000)
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero
# Copy DP 1.0 into xmm1 (the first move copies xmm0 onto itself)
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# Create DP 2.0 in xmm1 (1.0 + 1.0)
vaddpd xmm1, xmm1, xmm1
# Create DP 0.5 in xmm2 (1.0 / 2.0)
vdivpd xmm2, xmm0, xmm1
loop:
inc i
INSTR xmm3, xmm0, xmm0
INSTR xmm4, xmm1, xmm1
INSTR xmm5, xmm2, xmm2
INSTR xmm3, xmm0, xmm0
INSTR xmm4, xmm1, xmm1
INSTR xmm5, xmm2, xmm2
INSTR xmm6, xmm0, xmm0
INSTR xmm7, xmm1, xmm1
INSTR xmm8, xmm2, xmm2
INSTR xmm9, xmm0, xmm0
INSTR xmm10, xmm1, xmm1
INSTR xmm11, xmm2, xmm2
INSTR xmm12, xmm0, xmm0
INSTR xmm13, xmm1, xmm1
INSTR xmm14, xmm2, xmm2
INSTR xmm15, xmm0, xmm0
INSTR xmm16, xmm1, xmm1
INSTR xmm17, xmm2, xmm2
INSTR xmm18, xmm0, xmm0
INSTR xmm19, xmm1, xmm1
INSTR xmm20, xmm2, xmm2
INSTR xmm21, xmm0, xmm0
INSTR xmm22, xmm1, xmm1
INSTR xmm23, xmm2, xmm2
cmp i, N
jl loop
done:
# Common exit: restore the stack pointer and the saved rbp
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,65 @@
# Benchmark kernel for the three-operand xmm form of vsubsd, arranged as a dependency chain.
# Exports the global 32-bit value ninst (= NINST) and the global function latency.
# Input: edi (macro N) is the loop count; r8d (macro i) is the loop counter.
# The NINST copies of INSTR per iteration alternate between writing xmm0 and xmm1;
# every copy reads the register written by the copy before it.
#define INSTR vsubsd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
# Clear the loop counter; skip the whole kernel when the loop count is not positive
xor i, i
test N, N
jle done
# Create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (0xFFC0000000000000)
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero
# Copy DP 1.0 into xmm1 (the first move copies xmm0 onto itself)
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# Create DP 2.0 in xmm1 (1.0 + 1.0)
vaddpd xmm1, xmm1, xmm1
# Create DP 0.5 in xmm2 (1.0 / 2.0); xmm2 is not used by the loop below
vdivpd xmm2, xmm0, xmm1
loop:
inc i
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
cmp i, N
jl loop
done:
# Common exit: restore the stack pointer and the saved rbp
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,65 @@
# Benchmark kernel for the three-operand xmm form of vunpckhpd.
# Exports the global 32-bit value ninst (= NINST) and the global function latency.
# Input: edi (macro N) is the loop count; r8d (macro i) is the loop counter.
# The NINST copies of INSTR per iteration write xmm3..xmm23 and read only xmm0,
# xmm1 and xmm2, which the loop never modifies, so no copy reads the result of another.
# NOTE(review): xmm16..xmm23 need an EVEX (AVX-512) encoding - confirm the target CPU supports it.
#define INSTR vunpckhpd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
# Clear the loop counter; skip the whole kernel when the loop count is not positive
xor i, i
test N, N
jle done
# Create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (0xFFC0000000000000)
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero
# Copy DP 1.0 into xmm1 (the first move copies xmm0 onto itself)
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# Create DP 2.0 in xmm1 (1.0 + 1.0)
vaddpd xmm1, xmm1, xmm1
# Create DP 0.5 in xmm2 (1.0 / 2.0)
vdivpd xmm2, xmm0, xmm1
loop:
inc i
INSTR xmm3, xmm0, xmm0
INSTR xmm4, xmm1, xmm1
INSTR xmm5, xmm2, xmm2
INSTR xmm3, xmm0, xmm0
INSTR xmm4, xmm1, xmm1
INSTR xmm5, xmm2, xmm2
INSTR xmm6, xmm0, xmm0
INSTR xmm7, xmm1, xmm1
INSTR xmm8, xmm2, xmm2
INSTR xmm9, xmm0, xmm0
INSTR xmm10, xmm1, xmm1
INSTR xmm11, xmm2, xmm2
INSTR xmm12, xmm0, xmm0
INSTR xmm13, xmm1, xmm1
INSTR xmm14, xmm2, xmm2
INSTR xmm15, xmm0, xmm0
INSTR xmm16, xmm1, xmm1
INSTR xmm17, xmm2, xmm2
INSTR xmm18, xmm0, xmm0
INSTR xmm19, xmm1, xmm1
INSTR xmm20, xmm2, xmm2
INSTR xmm21, xmm0, xmm0
INSTR xmm22, xmm1, xmm1
INSTR xmm23, xmm2, xmm2
cmp i, N
jl loop
done:
# Common exit: restore the stack pointer and the saved rbp
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,65 @@
# Benchmark kernel for the three-operand xmm form of vunpckhpd, arranged as a dependency chain.
# Exports the global 32-bit value ninst (= NINST) and the global function latency.
# Input: edi (macro N) is the loop count; r8d (macro i) is the loop counter.
# The NINST copies of INSTR per iteration alternate between writing xmm0 and xmm1;
# every copy reads the register written by the copy before it.
#define INSTR vunpckhpd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
# Clear the loop counter; skip the whole kernel when the loop count is not positive
xor i, i
test N, N
jle done
# Create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (0xFFC0000000000000)
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero
# Copy DP 1.0 into xmm1 (the first move copies xmm0 onto itself)
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# Create DP 2.0 in xmm1 (1.0 + 1.0)
vaddpd xmm1, xmm1, xmm1
# Create DP 0.5 in xmm2 (1.0 / 2.0); xmm2 is not used by the loop below
vdivpd xmm2, xmm0, xmm1
loop:
inc i
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm0
cmp i, N
jl loop
done:
# Common exit: restore the stack pointer and the saved rbp
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,67 @@
# Benchmark kernel for the three-operand ymm form of vxorpd.
# Exports the global 32-bit value ninst (= NINST) and the global function latency.
# Input: edi (macro N) is the loop count; r8d (macro i) is the loop counter.
# The NINST copies of INSTR per iteration write ymm3..ymm23 and read only ymm0,
# ymm1 and ymm2, which the loop never modifies, so no copy reads the result of another.
# Every copy uses two different source registers: an XOR of a register with itself
# is a zeroing idiom that x86 cores can resolve at register rename without using an
# execution unit, so it would not measure the instruction itself.
# NOTE(review): ymm16..ymm23 need an EVEX (AVX-512) encoding - confirm the target CPU supports it.
#define INSTR vxorpd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
# Clear the loop counter; skip the whole kernel when the loop count is not positive
xor i, i
test N, N
jle done
# Create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (0xFFC0000000000000)
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero
# Expand from SSE to AVX: copy the low 128 bits of ymm0 into its high 128 bits
vinsertf128 ymm0, ymm0, xmm0, 0x1
# Copy DP 1.0 into ymm1 (the first move copies ymm0 onto itself)
vmovaps ymm0, ymm0
vmovaps ymm1, ymm0
# Create DP 2.0 in ymm1 (1.0 + 1.0)
vaddpd ymm1, ymm1, ymm1
# Create DP 0.5 in ymm2 (1.0 / 2.0)
vdivpd ymm2, ymm0, ymm1
loop:
inc i
INSTR ymm3, ymm0, ymm1
INSTR ymm4, ymm1, ymm2
INSTR ymm5, ymm2, ymm0
INSTR ymm3, ymm0, ymm1
INSTR ymm4, ymm1, ymm2
INSTR ymm5, ymm2, ymm0
INSTR ymm6, ymm0, ymm1
INSTR ymm7, ymm1, ymm2
INSTR ymm8, ymm2, ymm0
INSTR ymm9, ymm0, ymm1
INSTR ymm10, ymm1, ymm2
INSTR ymm11, ymm2, ymm0
INSTR ymm12, ymm0, ymm1
INSTR ymm13, ymm1, ymm2
INSTR ymm14, ymm2, ymm0
INSTR ymm15, ymm0, ymm1
INSTR ymm16, ymm1, ymm2
INSTR ymm17, ymm2, ymm0
INSTR ymm18, ymm0, ymm1
INSTR ymm19, ymm1, ymm2
INSTR ymm20, ymm2, ymm0
INSTR ymm21, ymm0, ymm1
INSTR ymm22, ymm1, ymm2
INSTR ymm23, ymm2, ymm0
cmp i, N
jl loop
done:
# Common exit: restore the stack pointer and the saved rbp
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,67 @@
# Benchmark kernel for the three-operand ymm form of vxorpd, arranged as a dependency chain.
# Exports the global 32-bit value ninst (= NINST) and the global function latency.
# Input: edi (macro N) is the loop count; r8d (macro i) is the loop counter.
# The NINST copies of INSTR per iteration alternate between writing ymm0 and ymm1;
# every copy reads the register written by the copy before it.
# Every copy uses two different source registers: an XOR of a register with itself
# is a dependency-breaking zeroing idiom and would cut the chain after every second copy.
#define INSTR vxorpd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
# Clear the loop counter; skip the whole kernel when the loop count is not positive
xor i, i
test N, N
jle done
# Create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (0xFFC0000000000000)
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero
# Expand from SSE to AVX: copy the low 128 bits of ymm0 into its high 128 bits
vinsertf128 ymm0, ymm0, xmm0, 0x1
# Copy DP 1.0 into ymm1 (the first move copies ymm0 onto itself)
vmovaps ymm0, ymm0
vmovaps ymm1, ymm0
# Create DP 2.0 in ymm1 (1.0 + 1.0)
vaddpd ymm1, ymm1, ymm1
# Create DP 0.5 in ymm2 (1.0 / 2.0); ymm2 is not used by the loop below
vdivpd ymm2, ymm0, ymm1
loop:
inc i
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm1
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm1
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm1
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm1
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm1
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm1
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm1
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm1
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm1
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm1
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm1
INSTR ymm0, ymm1, ymm0
INSTR ymm1, ymm0, ymm1
cmp i, N
jl loop
done:
# Common exit: restore the stack pointer and the saved rbp
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,65 @@
# Benchmark kernel for the three-operand xmm form of vxorpd.
# Exports the global 32-bit value ninst (= NINST) and the global function latency.
# Input: edi (macro N) is the loop count; r8d (macro i) is the loop counter.
# The NINST copies of INSTR per iteration write xmm3..xmm23 and read only xmm0,
# xmm1 and xmm2, which the loop never modifies, so no copy reads the result of another.
# Every copy uses two different source registers: an XOR of a register with itself
# is a zeroing idiom that x86 cores can resolve at register rename without using an
# execution unit, so it would not measure the instruction itself.
# NOTE(review): xmm16..xmm23 need an EVEX (AVX-512) encoding - confirm the target CPU supports it.
#define INSTR vxorpd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
# Clear the loop counter; skip the whole kernel when the loop count is not positive
xor i, i
test N, N
jle done
# Create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (0xFFC0000000000000)
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero
# Copy DP 1.0 into xmm1 (the first move copies xmm0 onto itself)
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# Create DP 2.0 in xmm1 (1.0 + 1.0)
vaddpd xmm1, xmm1, xmm1
# Create DP 0.5 in xmm2 (1.0 / 2.0)
vdivpd xmm2, xmm0, xmm1
loop:
inc i
INSTR xmm3, xmm0, xmm1
INSTR xmm4, xmm1, xmm2
INSTR xmm5, xmm2, xmm0
INSTR xmm3, xmm0, xmm1
INSTR xmm4, xmm1, xmm2
INSTR xmm5, xmm2, xmm0
INSTR xmm6, xmm0, xmm1
INSTR xmm7, xmm1, xmm2
INSTR xmm8, xmm2, xmm0
INSTR xmm9, xmm0, xmm1
INSTR xmm10, xmm1, xmm2
INSTR xmm11, xmm2, xmm0
INSTR xmm12, xmm0, xmm1
INSTR xmm13, xmm1, xmm2
INSTR xmm14, xmm2, xmm0
INSTR xmm15, xmm0, xmm1
INSTR xmm16, xmm1, xmm2
INSTR xmm17, xmm2, xmm0
INSTR xmm18, xmm0, xmm1
INSTR xmm19, xmm1, xmm2
INSTR xmm20, xmm2, xmm0
INSTR xmm21, xmm0, xmm1
INSTR xmm22, xmm1, xmm2
INSTR xmm23, xmm2, xmm0
cmp i, N
jl loop
done:
# Common exit: restore the stack pointer and the saved rbp
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,65 @@
# Benchmark kernel for the three-operand xmm form of vxorpd, arranged as a dependency chain.
# Exports the global 32-bit value ninst (= NINST) and the global function latency.
# Input: edi (macro N) is the loop count; r8d (macro i) is the loop counter.
# The NINST copies of INSTR per iteration alternate between writing xmm0 and xmm1;
# every copy reads the register written by the copy before it.
# Every copy uses two different source registers: an XOR of a register with itself
# is a dependency-breaking zeroing idiom and would cut the chain after every second copy.
#define INSTR vxorpd
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
# Clear the loop counter; skip the whole kernel when the loop count is not positive
xor i, i
test N, N
jle done
# Create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (0xFFC0000000000000)
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero
# Copy DP 1.0 into xmm1 (the first move copies xmm0 onto itself)
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# Create DP 2.0 in xmm1 (1.0 + 1.0)
vaddpd xmm1, xmm1, xmm1
# Create DP 0.5 in xmm2 (1.0 / 2.0); xmm2 is not used by the loop below
vdivpd xmm2, xmm0, xmm1
loop:
inc i
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
cmp i, N
jl loop
done:
# Common exit: restore the stack pointer and the saved rbp
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,65 @@
# Benchmark kernel for the three-operand xmm form of vxorps.
# Exports the global 32-bit value ninst (= NINST) and the global function latency.
# Input: edi (macro N) is the loop count; r8d (macro i) is the loop counter.
# The NINST copies of INSTR per iteration write xmm3..xmm23 and read only xmm0,
# xmm1 and xmm2, which the loop never modifies, so no copy reads the result of another.
# Every copy uses two different source registers: an XOR of a register with itself
# is a zeroing idiom that x86 cores can resolve at register rename without using an
# execution unit, so it would not measure the instruction itself.
# NOTE(review): xmm16..xmm23 need an EVEX (AVX-512) encoding - confirm the target CPU supports it.
#define INSTR vxorps
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
# Clear the loop counter; skip the whole kernel when the loop count is not positive
xor i, i
test N, N
jle done
# Create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (0xFFC0000000000000)
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero
# Copy DP 1.0 into xmm1 (the first move copies xmm0 onto itself)
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# Create DP 2.0 in xmm1 (1.0 + 1.0)
vaddpd xmm1, xmm1, xmm1
# Create DP 0.5 in xmm2 (1.0 / 2.0)
vdivpd xmm2, xmm0, xmm1
loop:
inc i
INSTR xmm3, xmm0, xmm1
INSTR xmm4, xmm1, xmm2
INSTR xmm5, xmm2, xmm0
INSTR xmm3, xmm0, xmm1
INSTR xmm4, xmm1, xmm2
INSTR xmm5, xmm2, xmm0
INSTR xmm6, xmm0, xmm1
INSTR xmm7, xmm1, xmm2
INSTR xmm8, xmm2, xmm0
INSTR xmm9, xmm0, xmm1
INSTR xmm10, xmm1, xmm2
INSTR xmm11, xmm2, xmm0
INSTR xmm12, xmm0, xmm1
INSTR xmm13, xmm1, xmm2
INSTR xmm14, xmm2, xmm0
INSTR xmm15, xmm0, xmm1
INSTR xmm16, xmm1, xmm2
INSTR xmm17, xmm2, xmm0
INSTR xmm18, xmm0, xmm1
INSTR xmm19, xmm1, xmm2
INSTR xmm20, xmm2, xmm0
INSTR xmm21, xmm0, xmm1
INSTR xmm22, xmm1, xmm2
INSTR xmm23, xmm2, xmm0
cmp i, N
jl loop
done:
# Common exit: restore the stack pointer and the saved rbp
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

View File

@@ -0,0 +1,65 @@
# Benchmark kernel for the three-operand xmm form of vxorps, arranged as a dependency chain.
# Exports the global 32-bit value ninst (= NINST) and the global function latency.
# Input: edi (macro N) is the loop count; r8d (macro i) is the loop counter.
# The NINST copies of INSTR per iteration alternate between writing xmm0 and xmm1;
# every copy reads the register written by the copy before it.
# Every copy uses two different source registers: an XOR of a register with itself
# is a dependency-breaking zeroing idiom and would cut the chain after every second copy.
#define INSTR vxorps
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
# Clear the loop counter; skip the whole kernel when the loop count is not positive
xor i, i
test N, N
jle done
# Create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (0xFFC0000000000000)
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero
# Copy DP 1.0 into xmm1 (the first move copies xmm0 onto itself)
vmovaps xmm0, xmm0
vmovaps xmm1, xmm0
# Create DP 2.0 in xmm1 (1.0 + 1.0)
vaddpd xmm1, xmm1, xmm1
# Create DP 0.5 in xmm2 (1.0 / 2.0); xmm2 is not used by the loop below
vdivpd xmm2, xmm0, xmm1
loop:
inc i
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
INSTR xmm0, xmm1, xmm0
INSTR xmm1, xmm0, xmm1
cmp i, N
jl loop
done:
# Common exit: restore the stack pointer and the saved rbp
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/xor-rr-TP.S Normal file
View File

@@ -0,0 +1,100 @@
# Benchmark kernel for xor with two 8-bit general-purpose register operands.
# Exports the global 32-bit value ninst (= NINST) and the global function latency.
# Input: edi (macro N) is the loop count; r8d (macro i) is the loop counter.
# Each loop iteration issues NINST copies of INSTR spread over the low bytes of
# rax, rbx, rcx, rdx and r9..r15.
# The low bytes of r9..r15 are written r9b..r15b, the names GNU as defines for them;
# with .intel_syntax noprefix an unknown name such as r9l would be taken as a symbol,
# which turns the operand into a memory reference.
#define INSTR xor
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
# Clear the loop counter; skip the whole kernel when the loop count is not positive
xor i, i
test N, N
jle done
# Create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (0xFFC0000000000000)
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero
# Save the general-purpose registers that are modified below
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Zero the saved registers
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# Copy the bit pattern of DP 1.0 into rax and rbx
vmovq rax, xmm0
vmovq rbx, xmm0
# Integer add of the two bit patterns: rbx = 0x7FE0000000000000 (this is not DP 2.0)
add rbx, rax
# Unsigned divide of rdx:rax by rax with rdx = 0: rax = 1, rdx = 0 (this is not DP 0.5)
div rax
movq rcx, rax
# Reload the bit pattern of DP 1.0 into rax
vmovq rax, xmm0
loop:
inc i
INSTR dl, al
INSTR r9b, bl
INSTR r10b, cl
INSTR dl, al
INSTR r9b, bl
INSTR r10b, cl
INSTR r11b, al
INSTR r12b, bl
INSTR r13b, cl
INSTR r14b, al
INSTR r15b, bl
INSTR al, cl
INSTR bl, al
INSTR cl, bl
INSTR dl, cl
INSTR r9b, al
INSTR r10b, bl
INSTR r11b, cl
INSTR r12b, al
INSTR r13b, bl
INSTR r14b, cl
INSTR r15b, al
INSTR al, bl
INSTR bl, cl
cmp i, N
jl loop
# Restore the saved registers in reverse order
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
# Common exit, also reached directly (before any register was saved) when the loop count is not positive
mov rsp, rbp
pop rbp
ret
.size latency, .-latency

100
testcases/xor-rr.S Normal file
View File

@@ -0,0 +1,100 @@
# Benchmark kernel for xor with two 8-bit general-purpose register operands,
# arranged as a dependency chain.
# Exports the global 32-bit value ninst (= NINST) and the global function latency.
# Input: edi (macro N) is the loop count; r8d (macro i) is the loop counter.
# The NINST copies of INSTR per iteration alternate between writing al and bl;
# every copy reads the register written by the copy before it.
#define INSTR xor
#define NINST 24
#define N edi
#define i r8d
.intel_syntax noprefix
.globl ninst
.data
ninst:
.long NINST
.text
.globl latency
.type latency, @function
.align 32
latency:
push rbp
mov rbp, rsp
# Clear the loop counter; skip the whole kernel when the loop count is not positive
xor i, i
test N, N
jle done
# Create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
vpcmpeqw xmm0, xmm0, xmm0 # all ones
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (0xFFC0000000000000)
vpsrlq xmm0, xmm0, 2 # logical right shift: sign bit and top exponent bit become zero
# Save the general-purpose registers that are modified below
push rax
push rbx
push rcx
push rdx
push r9
push r10
push r11
push r12
push r13
push r14
push r15
# Zero the saved registers
xor rax, rax
xor rbx, rbx
xor rcx, rcx
xor rdx, rdx
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
# Copy the bit pattern of DP 1.0 into rax and rbx
vmovq rax, xmm0
vmovq rbx, xmm0
# Integer add of the two bit patterns: rbx = 0x7FE0000000000000 (this is not DP 2.0)
add rbx, rax
# Unsigned divide of rdx:rax by rax with rdx = 0: rax = 1, rdx = 0 (this is not DP 0.5)
div rax
movq rcx, rax
# Reload the bit pattern of DP 1.0 into rax
vmovq rax, xmm0
loop:
inc i
INSTR al, bl
INSTR bl, al
INSTR al, bl
INSTR bl, al
INSTR al, bl
INSTR bl, al
INSTR al, bl
INSTR bl, al
INSTR al, bl
INSTR bl, al
INSTR al, bl
INSTR bl, al
INSTR al, bl
INSTR bl, al
INSTR al, bl
INSTR bl, al
INSTR al, bl
INSTR bl, al
INSTR al, bl
INSTR bl, al
INSTR al, bl
INSTR bl, al
INSTR al, bl
INSTR bl, al
cmp i, N
jl loop
# Restore the saved registers in reverse order
pop r15
pop r14
pop r13
pop r12
pop r11
pop r10
pop r9
pop rdx
pop rcx
pop rbx
pop rax
done:
# Common exit, also reached directly (before any register was saved) when the loop count is not positive
mov rsp, rbp
pop rbp
ret
.size latency, .-latency