mirror of
https://github.com/RRZE-HPC/OSACA.git
synced 2025-12-15 16:40:05 +01:00
initial upload
This commit is contained in:
103
Params.py
Executable file
103
Params.py
Executable file
@@ -0,0 +1,103 @@
|
||||
#!/apps/python/3.5-anaconda/bin/python
|
||||
class Parameter(object):
    """Generic instruction operand, identified only by its operand type tag."""

    # Operand type tags accepted by the constructor (compared in upper case).
    type_list = ["REG", "MEM", "IMD", "LBL", "NONE"]

    def __init__(self, ptype, name=""):
        """Store the upper-cased type tag of the operand.

        ptype -- operand type tag, matched case-insensitively against type_list
        name  -- accepted by the signature but not stored

        Raises NameError when the tag is not one of the supported types.
        """
        tag = ptype.upper()
        self.ptype = tag
        if tag in self.type_list:
            return
        raise NameError("Type not supported: "+ptype)

    def print(self):
        """Return the type tag; the "NONE" placeholder prints as an empty string."""
        return "" if self.ptype == "NONE" else self.ptype
|
||||
|
||||
class MemAddr(Parameter):
    """Memory operand, reduced to which address components it contains.

    The constructor inspects the operand text and records the presence of a
    segment register, an offset, a base, an index and a scale as booleans;
    ``print`` renders those flags as a generic memory-operand pattern.
    """

    # Segment register names accepted in front of the colon.
    segment_regs = ["CS", "DS", "SS", "ES", "FS", "GS"]
    # Scale factors accepted as third entry inside the parentheses.
    scales = [1, 2, 4, 8]

    def __init__(self, name):
        """Classify the memory operand text ``name``.

        Raises NameError when the segment prefix is unknown or when the
        parenthesised part is neither ``(base)`` nor ``(base,index,scale)``
        with a valid scale.
        """
        # Initialise the base class so that ptype exists on memory operands too.
        super().__init__("MEM", name)
        self.sreg = False
        self.offset = False
        self.base = False
        self.index = False
        self.scale = False
        if ':' in name:
            # Text between the one-character prefix and the colon must name
            # a segment register.
            if name[1:name.index(':')].upper() not in self.segment_regs:
                raise NameError("Type not supported: "+name)
            self.sreg = True
            self.offset = True
        # Anything in front of the opening parenthesis (or an operand without
        # parentheses at all) counts as an offset.
        if '(' not in name or name.index('(') != 0:
            self.offset = True
        if '(' in name:
            self.parentheses = name[name.index('(')+1:-1]
            self.commacnt = self.parentheses.count(',')
            if self.commacnt == 0:
                self.base = True
            elif self.commacnt == 2 and self._has_valid_scale():
                self.base = True
                self.index = True
                self.scale = True
            else:
                raise NameError("Type not supported: "+name)

    def _has_valid_scale(self):
        """Tell whether the text after the last comma is a supported scale.

        The whole field is parsed (not just its last character), and a
        non-numeric field is reported as invalid instead of raising ValueError.
        """
        scale_text = self.parentheses.rsplit(',', 1)[-1].strip()
        try:
            return int(scale_text) in self.scales
        except ValueError:
            return False

    def print(self):
        """Return the generic pattern of the operand, e.g. "MEM(offset(base))"."""
        pieces = ["MEM("]
        if self.sreg:
            pieces.append("sreg:")
        if self.offset:
            pieces.append("offset")
        if self.base and not self.index:
            pieces.append("(base)")
        elif self.base and self.index and self.scale:
            pieces.append("(base, index, scale)")
        pieces.append(")")
        self.mem_format = "".join(pieces)
        return self.mem_format
|
||||
|
||||
|
||||
|
||||
def _register_table():
    """Return the lookup table ``NAME -> (width in bits, register class)``."""
    ext = ["R{}".format(n) for n in range(8, 16)]  # R8 .. R15

    def numbered(prefix, count):
        return ["{}{}".format(prefix, n) for n in range(count)]

    groups = [
        # General Purpose Registers
        (["AH", "AL", "BH", "BL", "CH", "CL", "DH", "DL",
          "BPL", "SIL", "DIL", "SPL"], 8, "GPR"),
        ([r + "L" for r in ext] + [r + "B" for r in ext], 8, "GPR"),
        # "BX" replaces the former "BC" entry, which was a typo.
        (["AX", "BX", "CX", "DX", "BP", "SI", "DI", "SP"]
         + [r + "W" for r in ext], 16, "GPR"),
        (["EAX", "EBX", "ECX", "EDX", "EBP", "ESI", "EDI", "ESP"]
         + [r + "D" for r in ext], 32, "GPR"),
        (["RAX", "RBX", "RCX", "RDX", "RBP", "RSI", "RDI", "RSP"] + ext, 64, "GPR"),
        # Segment registers, flags and instruction pointers are filed as GPR.
        (["CS", "DS", "SS", "ES", "FS", "GS"], 16, "GPR"),
        (["EFLAGS", "EIP"], 32, "GPR"),
        (["RFLAGS", "RIP"], 64, "GPR"),
        # FPU Registers
        (numbered("ST", 8), 80, "FPU"),
        # MMX Registers
        (numbered("MM", 8), 64, "MMX"),
        # XMM / YMM / ZMM Registers
        (numbered("XMM", 32), 128, "XMM"),
        (numbered("YMM", 32), 256, "YMM"),
        (numbered("ZMM", 32), 512, "ZMM"),
        # Opmask Registers
        (numbered("K", 8), 64, "K"),
        # Bounds Registers
        (numbered("BND", 4), 128, "BND"),
        # Registers in general (class names used as placeholders)
        (["GPR8"], 8, "GPR"), (["GPR16"], 16, "GPR"),
        (["GPR32"], 32, "GPR"), (["GPR64"], 64, "GPR"),
        (["FPU"], 80, "FPU"), (["MMX"], 64, "MMX"),
        (["XMM"], 128, "XMM"), (["YMM"], 256, "YMM"), (["ZMM"], 512, "ZMM"),
        (["K"], 64, "K"), (["BND"], 128, "BND"),
    ]
    table = {}
    for names, width, reg_class in groups:
        for reg in names:
            table[reg] = (width, reg_class)
    return table


class Register(Parameter):
    """Register operand with its width in bits and its register class."""

    # Upper-case register name -> (width in bits, register class).
    sizes = _register_table()

    def __init__(self, name, mask=False):
        """Look up ``name`` (case-insensitive) in the register table.

        name -- register name without any prefix character
        mask -- True when the operand carries an opmask annotation

        Raises NameError when the register is not in the table.
        """
        super().__init__("REG", name)
        self.name = name.upper()
        self.mask = mask
        try:
            self.size, self.reg_type = self.sizes[self.name]
        except KeyError:
            raise NameError("Register name not in dictionary: "+self.name) from None

    def print(self):
        """Return the register class, followed by "{opmask}" for masked operands."""
        opmask = "{opmask}" if self.mask else ""
        return self.reg_type+opmask
|
||||
255
Testcase.py
Executable file
255
Testcase.py
Executable file
@@ -0,0 +1,255 @@
|
||||
#!/apps/python/3.5-anaconda/bin/python
|
||||
|
||||
import os
|
||||
from subprocess import call
|
||||
from math import ceil
|
||||
from Params import Register
|
||||
|
||||
class Testcase(object):
    """Build benchmark assembly files for one instruction form.

    An instance is created from a mnemonic and a list of register operands
    (objects exposing ``reg_type`` and ``size``).  The constructor prepares
    all text fragments; ``write_testcase`` combines them into one file with
    the latency loop and one with the throughput loop.
    """

    ##------------------Constant variables--------------------------
    # Lookup tables for regs.  r8 and rdi are not listed for the GPRs: the
    # generated header defines the loop counter as r8d and the bound as edi.
    gprs64 = ['rax', 'rbx', 'rcx', 'rdx', 'r9', 'r10', 'r11', 'r12', 'r13', 'r14', 'r15']
    gprs32 = ['eax', 'ebx', 'ecx', 'edx', 'r9d', 'r10d', 'r11d', 'r12d', 'r13d', 'r14d', 'r15d']
    gprs16 = ['ax', 'bx', 'cx', 'dx', 'r9w', 'r10w', 'r11w', 'r12w', 'r13w', 'r14w', 'r15w']
    gprs8 = ['al', 'bl', 'cl', 'dl', 'r9l', 'r10l', 'r11l', 'r12l', 'r13l', 'r14l', 'r15l']
    fpus = ['st{}'.format(n) for n in range(8)]
    mmxs = ['mm{}'.format(n) for n in range(8)]
    ks = ['k{}'.format(n) for n in range(8)]
    # Only bnd0..bnd3 are listed; the former bnd4..bnd7 entries were removed
    # to agree with the register table used for operand classification.
    bnds = ['bnd{}'.format(n) for n in range(4)]
    xmms = ['xmm{}'.format(n) for n in range(32)]
    ymms = ['ymm{}'.format(n) for n in range(32)]
    zmms = ['zmm{}'.format(n) for n in range(32)]

    # Operand class key -> list of concrete register names to draw from.
    ops = {'gpr64': gprs64, 'gpr32': gprs32, 'gpr16': gprs16, 'gpr8': gprs8,
           'fpu': fpus, 'mmx': mmxs, 'k': ks, 'bnd': bnds,
           'xmm': xmms, 'ymm': ymms, 'zmm': zmms}

    # Create Single Precision 1.0
    sp1 = ('\t\t# create SP 1.0\n'
           '\t\tvpcmpeqw xmm0, xmm0, xmm0\n'
           '\t\tvpslld xmm0, xmm0, 25\t\t\t# logical left shift: 11111110..0 (25=32-(8-1))\n'
           '\t\tvpsrld xmm0, xmm0, 2\t\t\t# logical right shift: 1 bit for sign; leading mantissa bit is zero\n'
           '\t\t# copy SP 1.0\n')
    # Create Double Precision 1.0
    dp1 = ('\t\t# create DP 1.0\n'
           '\t\tvpcmpeqw xmm0, xmm0, xmm0\t\t# all ones\n'
           '\t\tvpsllq xmm0, xmm0, 54\t\t\t# logical left shift: 11111110..0 (54=64-(10-1))\n'
           '\t\tvpsrlq xmm0, xmm0, 2\t\t\t# logical right shift: 1 bit for sign; leading mantissa bit is zero\n')
    # Create epilogue
    done = ('done:\n'
            '\t\tmov\trsp, rbp\n'
            '\t\tpop\trbp\n'
            '\t\tret\n'
            '.size latency, .-latency')
    ##----------------------------------------------------------------

    # Constructor
    def __init__(self, _mnemonic, _param_list, _num_instr='12'):
        """Prepare every text fragment of the benchmark.

        _mnemonic   -- instruction mnemonic (stored lower-case)
        _param_list -- register operands; each needs ``reg_type`` and ``size``
        _num_instr  -- number of instructions per loop body (int or numeric
                       string); odd values are rounded up to the next even one
        """
        self.instr = _mnemonic.lower()
        self.param_list = _param_list
        # num_instr must be an even number
        self.num_instr = str(ceil(int(_num_instr)/2)*2)
        # Check for the number of operands and initialise the GPRs if necessary
        (self.reg_a, self.reg_b, self.reg_c, self.gprPush, self.gprPop,
         self.zeroGPR, self.copy) = self.__define_regs()
        self.num_regs = len(self.param_list)
        # Create asm header
        self.def_instr, self.ninstr, self.init, self.expand = self.__define_header()
        # Create latency and throughput loop
        self.loop_lat = self.__define_loop_lat()
        self.loop_thrpt = self.__define_loop_thrpt()

    def write_testcase(self):
        """Write the latency (.S) and throughput (-TP.S) files into ./testcases."""
        def tag(key):
            # 'gpr64' is abbreviated to 'r64' in the file name.
            return key if 'gpr' not in key else 'r'+key[3:]

        # Add operands
        extension = '-' + '_'.join(tag(key) for key in (self.reg_a, self.reg_b, self.reg_c))
        prologue = (self.def_instr+self.ninstr+self.init+self.dp1+self.expand
                    + self.gprPush+self.zeroGPR+self.copy)
        epilogue = self.gprPop+self.done
        os.makedirs('testcases', exist_ok=True)
        for suffix, loop in (('', self.loop_lat), ('-TP', self.loop_thrpt)):
            # The context manager closes the file even if the write fails.
            with open('./testcases/'+self.instr+extension+suffix+'.S', 'w') as f:
                f.write(prologue+loop+epilogue)

    @staticmethod
    def __operand_key(reg):
        """Return the ``ops`` key of an operand: its class, plus the size for GPRs."""
        key = reg.reg_type.lower()
        if key == 'gpr':
            key += str(reg.size)
        return key

    # Check register
    def __define_regs(self):
        """Derive the operand class keys and the register set-up fragments.

        Returns (reg_a, reg_b, reg_c, gprPush, gprPop, zeroGPR, copy); keys of
        missing operands are empty strings.  The third operand is only
        considered when there are exactly three operands.
        """
        regs = self.param_list
        reg_a = self.__operand_key(regs[0])
        reg_b = self.__operand_key(regs[1]) if len(regs) > 1 else ''
        reg_c = self.__operand_key(regs[2]) if len(regs) == 3 else ''
        # Save/zero the GPRs as soon as any operand is a general purpose register.
        if any('gpr' in key for key in (reg_a, reg_b, reg_c)):
            gprPush, gprPop, zeroGPR = self.__initialise_gprs()
        else:
            gprPush, gprPop, zeroGPR = ('', '', '')
        # The initial values are copied according to the second operand, or
        # the only operand if there is just one.
        copy = self.__copy_regs(regs[0] if len(regs) == 1 else regs[1])
        return (reg_a, reg_b, reg_c, gprPush, gprPop, zeroGPR, copy)

    # Initialise 11 general purpose registers and set them to zero
    def __initialise_gprs(self):
        """Return the push, pop (reverse order) and zeroing code for all 64-bit GPRs."""
        gprPush = ''.join('\t\tpush {}\n'.format(reg) for reg in self.gprs64)
        gprPop = ''.join('\t\tpop {}\n'.format(reg) for reg in reversed(self.gprs64))
        zeroGPR = ''.join('\t\txor {}, {}\n'.format(reg, reg) for reg in self.gprs64)
        return (gprPush, gprPop, zeroGPR)

    # Copy created values in specific register
    def __copy_regs(self, reg):
        """Return the code that copies the created value into working registers.

        GPR, MMX and XMM/YMM/ZMM operands are handled differently; every other
        register class yields an empty string.
        """
        gpr = self.ops['gpr64']
        copy = '\t\t# copy DP 1.0\n'
        if reg.reg_type == 'GPR':
            copy += '\t\tvmovq {}, xmm0\n'.format(gpr[0])
            copy += '\t\tvmovq {}, xmm0\n'.format(gpr[1])
            copy += '\t\t# Create DP 2.0\n'
            copy += '\t\tadd {}, {}\n'.format(gpr[1], gpr[0])
            copy += '\t\t# Create DP 0.5\n'
            copy += '\t\tdiv {}\n'.format(gpr[0])
            copy += '\t\tmovq {}, {}\n'.format(gpr[2], gpr[0])
            copy += '\t\tvmovq {}, xmm0\n'.format(gpr[0])
        elif reg.reg_type == 'MMX':
            mmx = self.ops['mmx']
            copy += '\t\tvmovq {}, xmm0\n'.format(mmx[0])
            copy += '\t\tvmovq {}, xmm0\n'.format(mmx[1])
            copy += '\t\tvmovq {}, xmm0\n'.format(gpr[0])
            copy += '\t\t# Create DP 2.0\n'
            # Was a bare `ops[...]`, which raised NameError for MMX operands.
            copy += '\t\tadd {}, {}\n'.format(mmx[1], mmx[0])
            copy += '\t\t# Create DP 0.5\n'
            copy += '\t\tdiv {}\n'.format(gpr[0])
            copy += '\t\tmovq {}, {}\n'.format(mmx[2], gpr[0])
        elif reg.reg_type in ('XMM', 'YMM', 'ZMM'):
            vec = self.ops[reg.reg_type.lower()]
            copy += '\t\tvmovaps {}, {}\n'.format(vec[0], vec[0])
            copy += '\t\tvmovaps {}, {}\n'.format(vec[1], vec[0])
            copy += '\t\t# Create DP 2.0\n'
            copy += '\t\tvaddpd {}, {}, {}\n'.format(vec[1], vec[1], vec[1])
            copy += '\t\t# Create DP 0.5\n'
            copy += '\t\tvdivpd {}, {}, {}\n'.format(vec[2], vec[0], vec[1])
        else:
            copy = ''
        return copy

    def __define_header(self):
        """Return (def_instr, ninstr, init, expand) for the top of the file."""
        def_instr = '#define INSTR '+self.instr+'\n'
        ninstr = '#define NINST '+self.num_instr+'\n'
        init = ('#define N edi\n'
                '#define i r8d\n\n\n'
                '.intel_syntax noprefix\n'
                '.globl ninst\n'
                '.data\n'
                'ninst:\n'
                '.long NINST\n'
                '.text\n'
                '.globl latency\n'
                '.type latency, @function\n'
                '.align 32\n'
                'latency:\n'
                '\t\tpush\trbp\n'
                '\t\tmov\trbp, rsp\n'
                '\t\txor\ti, i\n'
                '\t\ttest\tN, N\n'
                '\t\tjle\tdone\n')
        # Expand to AVX(512) if necessary; a zmm operand takes precedence.
        keys = (self.reg_a, self.reg_b, self.reg_c)
        expand = ''
        if 'ymm' in keys:
            expand = ('\t\t# expand from SSE to AVX\n'
                      '\t\tvinsertf128 ymm0, ymm0, xmm0, 0x1\n')
        if 'zmm' in keys:
            expand = ('\t\t# expand from SSE to AVX\n'
                      '\t\tvinsertf128 ymm0, ymm0, xmm0, 0x1\n'
                      '\t\t# expand from AVX to AVX512\n'
                      '\t\tvinsert64x4 zmm0, zmm0, ymm0, 0x1\n')
        return (def_instr, ninstr, init, expand)

    # Create latency loop
    def __define_loop_lat(self):
        """Return the latency loop.

        The body stays empty for operand combinations that match none of the
        cases below (e.g. three operands where the first class differs from
        both others).
        """
        count = int(self.num_instr)
        pairs = len(range(0, count, 2))
        body = ''
        if self.num_regs == 1:
            body = '\t\tINSTR {}\n'.format(self.ops[self.reg_a][0]) * count
        elif self.num_regs == 2:
            a, b = self.ops[self.reg_a], self.ops[self.reg_b]
            if self.reg_a == self.reg_b:
                pair = ('\t\tINSTR {}, {}\n'.format(a[0], b[1])
                        + '\t\tINSTR {}, {}\n'.format(b[1], b[0]))
            else:
                pair = '\t\tINSTR {}, {}\n'.format(a[0], b[0]) * 2
            body = pair * pairs
        elif self.num_regs == 3 and self.reg_a in (self.reg_b, self.reg_c):
            a, b, c = self.ops[self.reg_a], self.ops[self.reg_b], self.ops[self.reg_c]
            # Same class for destination and first source: alternate both.
            first_src = b[1] if self.reg_a == self.reg_b else b[0]
            pair = ('\t\tINSTR {}, {}, {}\n'.format(a[0], first_src, c[0])
                    + '\t\tINSTR {}, {}, {}\n'.format(a[1], b[0], c[0]))
            body = pair * pairs
        return ('loop:\n'
                '\t\tinc i\n'
                + body +
                '\t\tcmp i, N\n'
                '\t\tjl loop\n')

    # Create throughput loop
    def __define_loop_thrpt(self):
        """Return the throughput loop, rotating over the available registers."""
        loop_thrpt = ('loop:\n'
                      '\t\tinc i\n')
        has_second = self.num_regs in (2, 3)
        has_third = self.num_regs == 3
        dest_regs = self.ops[self.reg_a]
        ext = ''
        for i in range(0, int(self.num_instr)):
            if has_second:
                ext = ', {}'.format(self.ops[self.reg_b][i % 3])
            if has_third:
                ext += ', {}'.format(self.ops[self.reg_c][i % 3])
            # Destinations start at index 3 so that the first instructions do
            # not write the registers 0..2 used as sources.
            regNum = i % len(dest_regs) if (i > 2) else (i+3) % len(dest_regs)
            loop_thrpt += '\t\tINSTR {}{}\n'.format(dest_regs[regNum], ext)
        loop_thrpt += ('\t\tcmp i, N\n'
                       '\t\tjl loop\n')
        return loop_thrpt

    def __is_in_dir(self, name, path):
        """Return True if a file called ``name`` exists anywhere below ``path``."""
        return any(name in files for _root, _dirs, files in os.walk(path))
|
||||
42
data/ivb_throughput.csv
Normal file
42
data/ivb_throughput.csv
Normal file
@@ -0,0 +1,42 @@
|
||||
instr,clock_cycles
|
||||
vmovapd-TP,0.84
|
||||
vaddsd-TP,1.016
|
||||
inc-TP,0.446
|
||||
cmp-TP,0.447
|
||||
inc-rrxmm-TP,0.446
|
||||
cmp-rrxmm-TP,0.446
|
||||
vmovq-TP,1.17
|
||||
vmovsd-TP,1.17
|
||||
xor-TP,0.336
|
||||
vxorpd-avx-TP,0.335
|
||||
vmovq-rxmmxmm-TP,1.004
|
||||
vxorps-TP,0.336
|
||||
vunpckhpd-TP,1.177
|
||||
test-TP,0.446
|
||||
vmulsd-TP,1.0170000000000001
|
||||
test-rrxmm-TP,0.446
|
||||
add-TP,0.47200000000000003
|
||||
neg-TP,0.447
|
||||
add-rrxmm-TP,0.47100000000000003
|
||||
mov-TP,0.386
|
||||
mov-rrxmm-TP,0.37
|
||||
vaddpd-avx-TP,1.016
|
||||
xor-rrxmm-TP,0.336
|
||||
sub-TP,0.335
|
||||
sub-rrxmm-TP,0.336
|
||||
vxorpd-TP,0.336
|
||||
vmovapd-avx-TP,0.8370000000000001
|
||||
vmulpd-avx-TP,1.021
|
||||
vsubsd-TP,1.014
|
||||
vmovaps-TP,0.836
|
||||
vaddpd-TP,1.015
|
||||
vsubpd-avx-TP,1.014
|
||||
dec-TP,0.447
|
||||
lea-TP,0.5
|
||||
jb-TP,0.447
|
||||
vmulss-xmmxmmxmm-TP,1.0
|
||||
vaddss-xmmxmmxmm-TP,1.0
|
||||
vcvtsi2ss-xmmxmmr-TP,1.0859999999999999
|
||||
xor-rr-TP,0.413
|
||||
vxorps-xmmxmmxmm-TP,0.3333333333333333
|
||||
inc-rxmmxmm-TP,0.390
|
||||
|
339
get_instr.py
Executable file
339
get_instr.py
Executable file
@@ -0,0 +1,339 @@
|
||||
#!/apps/python/3.5-anaconda/bin/python
|
||||
import sys
|
||||
import re
|
||||
from Testcase import *
|
||||
|
||||
# Marker comment that precedes the code section to analyse.
marker = r'//STARTLOOP'
# Lines carrying assembly contain whitespace, a hex number and a colon.
asm_line = re.compile(r'\s[0-9a-f]+[:]')

# Parser state shared between the functions below.
numSeps = sem = 0        # indentation width at the marker / remaining "still in loop" budget
cntChar = ''             # indentation character (space or tab), set at the first marker
first = True             # True until the indentation character has been determined
lncnt = 1                # current line number of the file being read
fname = ""               # name of the file being read

# Instruction form -> number of occurrences, and its sorted view.
db, sorted_db = {}, []
|
||||
|
||||
def extract_instr(asmFile):
    """Read a ``.log`` file line by line and pass every line to check_line.

    asmFile -- path of the file; it must end in ".log", otherwise the
               program exits after printing "Invalid argument"

    A file that cannot be opened is reported and skipped (the function
    returns) instead of failing later on an undefined file handle.
    """
    global lncnt
    global fname
    fname = asmFile
    # Check if parameter is in the correct file format
    if not asmFile.endswith(".log"):
        print("Invalid argument")
        sys.exit()
    # Open file
    try:
        f = open(asmFile, "r")
    except IOError:
        print("IOError: File not found")
        return
    # Analyse code line by line and check the instructions; the context
    # manager closes the file even if the analysis raises.
    with f:
        lncnt = 1
        for line in f:
            check_line(line)
            lncnt += 1
|
||||
|
||||
|
||||
def check_line(line):
    """Track whether ``line`` is inside the marked section and analyse its code.

    A marker line records its indentation width and sets the semaphore to 2.
    While the semaphore is positive, assembly lines without a ``//`` comment
    are handed to check_instr; any other line indented no deeper than the
    marker decrements the semaphore.
    """
    global numSeps
    global sem
    global first
    if marker in line:
        # On the first marker, find out whether the high level code is
        # indented with whitespaces or tabs.
        if first:
            set_counter_char(line)
            first = False
        indentation = re.split(marker, line)[0]
        numSeps = indentation.count(cntChar)
        sem = 2
        return
    if sem <= 0:
        return
    # We're in the marked code snippet.
    if re.search(asm_line, line):
        # Assembly lines that contain a comment are skipped.
        if r'//' not in line:
            check_instr(re.split(r'\t', line)[-1])
        return
    leading = re.split(r'\S', line)[0]
    if leading.count(cntChar) <= numSeps:
        # Not in the loop anymore - or yet - so we decrement the semaphore
        sem = sem-1
|
||||
|
||||
# Check if separator is either tabulator or whitespace
def set_counter_char(line):
    """Set the global indentation character from the text before the marker.

    Raises NotImplementedError unless that text contains exactly one of the
    two kinds: only spaces or only tabs.
    """
    global cntChar
    indentation = re.split(marker, line)[0]
    has_spaces = indentation.count(" ") != 0
    has_tabs = indentation.count("\t") != 0
    if has_spaces == has_tabs:
        # Either mixed indentation or none at all.
        raise NotImplementedError("Indentation of code is only supported for whitespaces and tabs.")
    cntChar = ' ' if has_spaces else '\t'
|
||||
|
||||
|
||||
def check_instr(instr):
    """Classify one instruction, count its instruction form and create a testcase.

    instr -- instruction text (mnemonic followed by its operands)

    The operands are reduced to their kinds (IMD, register class, MEM(...),
    LBL), joined with the mnemonic to an instruction form and counted in
    ``db``.  When a form is seen for the first time and consists of register
    operands only, a Testcase is generated and written.  Empty input and
    lines that consist of a single byte are ignored.
    """
    global db
    # Check for strange clang padding bytes
    while instr.startswith("data32"):
        instr = instr[7:]
    # Separate mnemonic and operands
    tokens = instr.split()
    if not tokens:
        # Nothing but whitespace: there is no mnemonic to analyse.
        return
    mnemonic = tokens[0]
    params = "".join(tokens[1:])
    # Check if line is not only a byte
    if re.fullmatch(r'[0-9a-f]{2}', mnemonic):
        return
    # Check operands and separate them by IMMEDIATE (IMD), REGISTER (REG),
    # MEMORY (MEM) or LABEL (LBL)
    regList = []
    for op in flatten(separate_params(params)):
        if len(op) <= 0:
            operand = Parameter("NONE")
        elif op[0] == '$':
            operand = Parameter("IMD")
        elif op[0] == '%' and '(' not in op:
            # A '{' starts the opmask annotation; the name ends before it.
            opmask = '{' in op
            end = op.index('{') if opmask else len(op)
            operand = Register(op[1:end], opmask)
        elif '<' in op:
            operand = Parameter("LBL")
        else:
            operand = MemAddr(op)
        regList.append(operand)
    param_list = [operand.print() for operand in regList]
    # Join mnemonic and operand(s) to an instruction form
    tabs = "\t" if len(mnemonic) > 7 else "\t\t"
    instr_form = mnemonic+tabs+(" ".join(param_list))
    # Check in database for instruction form and increment the counter
    if instr_form in db:
        db[instr_form] = db[instr_form]+1
        return
    db[instr_form] = 1
    # Create testcase for instruction form, since it is the first appearance
    # of it. But (as far as now) only for instr forms with only registers as
    # operands.
    if all(isinstance(par, Register) for par in regList):
        # Create testcase with reversed param list, due to the fact its intel syntax!
        tc = Testcase(mnemonic, list(reversed(regList)), '24')
        tc.write_testcase()
|
||||
# print("-----------")
|
||||
|
||||
def separate_params(params):
    """Split an operand string into a nested list ``[first, [second, [...]]]``.

    The split happens at the first comma, unless the first closing
    parenthesis is directly followed by a comma (then it happens there) or an
    opening parenthesis precedes the first comma (then the whole string is one
    operand).  Without any comma, a trailing ``#...`` part is cut off.
    """
    if ',' not in params:
        if '#' in params:
            return [params[:params.index('#')]]
        return [params]
    cut = params.index(',')
    if ')' in params:
        close = params.index(')')
        if close < len(params)-1 and params[close+1] == ',':
            # The parenthesised operand ends right before a separating comma.
            cut = close+1
        elif params.index('(') < cut:
            # The first comma lies inside the parentheses: single operand.
            return [params]
    return [params[:cut], separate_params(params[cut+1:])]
|
||||
|
||||
|
||||
def sort_db():
    """Store the database entries in ``sorted_db``, most frequent form first."""
    global sorted_db

    def occurrences(entry):
        # entry is an (instruction form, count) pair
        return entry[1]

    sorted_db = sorted(db.items(), key=occurrences, reverse=True)
|
||||
|
||||
|
||||
def print_sorted_db():
    """Print all instruction forms by descending count, followed by the total."""
    sort_db()
    total = 0  # named `total` so the builtin sum() is not shadowed
    print("Number of\tmnemonic")
    print("calls\n")
    for instr_form, count in sorted_db:
        print(str(count)+"\t\t"+instr_form)
        total += count
    print("\nCumulated number of instructions: "+str(total))
|
||||
|
||||
|
||||
def save_db():
    """Write the database to ".cnt_asm_ops.db", one "form<TAB>count" line each."""
    # The context manager closes the file even if a write fails.
    with open(".cnt_asm_ops.db", "w") as db_file:
        for instr_form, count in db.items():
            db_file.write(instr_form+"\t"+str(count)+"\n")
|
||||
|
||||
|
||||
def load_db():
    """Load the counts stored in ".cnt_asm_ops.db" into the database.

    Each line holds an instruction form and its count, separated by the last
    tab of the line.  A missing file is reported and leaves the database
    untouched; blank lines are skipped.
    """
    global db
    try:
        db_file = open(".cnt_asm_ops.db", "r")
    except FileNotFoundError:
        print("no database found in current directory")
        return
    with db_file:
        for line in db_file:
            # Strip only the line break, so a final line without one keeps
            # all digits of its count.
            record = line.rstrip("\n")
            if not record:
                continue
            # The instruction form itself contains tabs (between mnemonic and
            # operands), therefore split at the last tab only.
            instr_form, numCalls = record.rsplit("\t", 1)
            db[instr_form] = int(numCalls)
|
||||
|
||||
|
||||
def flatten(l):
    """Return the elements of an arbitrarily nested list as one flat list.

    An empty list is returned as it is; only ``list`` instances are expanded.
    """
    if l == []:
        return l
    flat = []
    for element in l:
        if isinstance(element, list):
            flat.extend(flatten(element))
        else:
            flat.append(element)
    return flat
|
||||
|
||||
|
||||
|
||||
|
||||
class Parameter(object):
    """Generic instruction operand, identified only by its operand type tag."""

    # Operand type tags accepted by the constructor (compared in upper case).
    type_list = ["REG", "MEM", "IMD", "LBL", "NONE"]

    def __init__(self, ptype, name=""):
        """Store the upper-cased type tag of the operand.

        ptype -- operand type tag, matched case-insensitively against type_list
        name  -- accepted by the signature but not stored

        Raises NameError when the tag is not one of the supported types.
        """
        tag = ptype.upper()
        self.ptype = tag
        if tag in self.type_list:
            return
        raise NameError("Type not supported: "+ptype)

    def print(self):
        """Return the type tag; the "NONE" placeholder prints as an empty string."""
        return "" if self.ptype == "NONE" else self.ptype
|
||||
|
||||
class MemAddr(Parameter):
    """Memory operand, reduced to which address components it contains.

    The constructor inspects the operand text and records the presence of a
    segment register, an offset, a base, an index and a scale as booleans;
    ``print`` renders those flags as a generic memory-operand pattern.
    """

    # Segment register names accepted in front of the colon.
    segment_regs = ["CS", "DS", "SS", "ES", "FS", "GS"]
    # Scale factors accepted as third entry inside the parentheses.
    scales = [1, 2, 4, 8]

    def __init__(self, name):
        """Classify the memory operand text ``name``.

        Raises NameError when the segment prefix is unknown or when the
        parenthesised part is neither ``(base)`` nor ``(base,index,scale)``
        with a valid scale.
        """
        # Initialise the base class so that ptype exists on memory operands too.
        super().__init__("MEM", name)
        self.sreg = False
        self.offset = False
        self.base = False
        self.index = False
        self.scale = False
        if ':' in name:
            # Text between the one-character prefix and the colon must name
            # a segment register.
            if name[1:name.index(':')].upper() not in self.segment_regs:
                raise NameError("Type not supported: "+name)
            self.sreg = True
            self.offset = True
        # Anything in front of the opening parenthesis (or an operand without
        # parentheses at all) counts as an offset.
        if '(' not in name or name.index('(') != 0:
            self.offset = True
        if '(' in name:
            self.parentheses = name[name.index('(')+1:-1]
            self.commacnt = self.parentheses.count(',')
            if self.commacnt == 0:
                self.base = True
            elif self.commacnt == 2 and self._has_valid_scale():
                self.base = True
                self.index = True
                self.scale = True
            else:
                raise NameError("Type not supported: "+name)

    def _has_valid_scale(self):
        """Tell whether the text after the last comma is a supported scale.

        The whole field is parsed (not just its last character), and a
        non-numeric field is reported as invalid instead of raising ValueError.
        """
        scale_text = self.parentheses.rsplit(',', 1)[-1].strip()
        try:
            return int(scale_text) in self.scales
        except ValueError:
            return False

    def print(self):
        """Return the generic pattern of the operand, e.g. "MEM(offset(base))"."""
        pieces = ["MEM("]
        if self.sreg:
            pieces.append("sreg:")
        if self.offset:
            pieces.append("offset")
        if self.base and not self.index:
            pieces.append("(base)")
        elif self.base and self.index and self.scale:
            pieces.append("(base, index, scale)")
        pieces.append(")")
        self.mem_format = "".join(pieces)
        return self.mem_format
|
||||
|
||||
|
||||
|
||||
class Register(Parameter):
|
||||
sizes = {
|
||||
#General Purpose Registers
|
||||
"AH":(8,"GPR"), "AL":(8,"GPR"), "BH":(8,"GPR"), "BL":(8,"GPR"), "CH":(8,"GPR"), "CL":(8,"GPR"), "DH":(8,"GPR"), "DL":(8,"GPR"), "BPL":(8,"GPR"), "SIL":(8,"GPR"), "DIL":(8,"GPR"), "SPL":(8,"GPR"), "R8L":(8,"GPR"), "R9L":(8,"GPR"), "R10L":(8,"GPR"), "R11L":(8,"GPR"), "R12L":(8,"GPR"), "R13L":(8,"GPR"), "R14L":(8,"GPR"), "R15L":(8,"GPR"),
|
||||
"R8B":(8,"GPR"),"R9B":(8,"GPR"),"R10B":(8,"GPR"),"R11B":(8,"GPR"),"R12B":(8,"GPR"),"R13B":(8,"GPR"),"R14B":(8,"GPR"),"R15B":(8,"GPR"),
|
||||
"AX":(16,"GPR"), "BC":(16,"GPR"), "CX":(16,"GPR"), "DX":(16,"GPR"), "BP":(16,"GPR"), "SI":(16,"GPR"), "DI":(16,"GPR"), "SP":(16,"GPR"), "R8W":(16,"GPR"), "R9W":(16,"GPR"), "R10W":(16,"GPR"), "R11W":(16,"GPR"), "R12W":(16,"GPR"), "R13W":(16,"GPR"), "R14W":(16,"GPR"), "R15W":(16,"GPR"),
|
||||
"EAX":(32,"GPR"), "EBX":(32,"GPR"), "ECX":(32,"GPR"), "EDX":(32,"GPR"), "EBP":(32,"GPR"), "ESI":(32,"GPR"), "EDI":(32,"GPR"), "ESP":(32,"GPR"), "R8D":(32,"GPR"), "R9D":(32,"GPR"), "R10D":(32,"GPR"), "R11D":(32,"GPR"), "R12D":(32,"GPR"), "R13D":(32,"GPR"), "R14D":(32,"GPR"), "R15D":(32,"GPR"),
|
||||
"RAX":(64,"GPR"), "RBX":(64,"GPR"), "RCX":(64,"GPR"), "RDX":(64,"GPR"), "RBP":(64,"GPR"), "RSI":(64,"GPR"), "RDI":(64,"GPR"), "RSP":(64,"GPR"), "R8":(64,"GPR"), "R9":(64,"GPR"), "R10":(64,"GPR"), "R11":(64,"GPR"), "R12":(64,"GPR"), "R13":(64,"GPR"), "R14":(64,"GPR"), "R15":(64,"GPR"),
|
||||
"CS":(16,"GPR"), "DS":(16,"GPR"), "SS":(16,"GPR"), "ES":(16,"GPR"), "FS":(16,"GPR"), "GS":(16,"GPR"),
|
||||
"EFLAGS":(32,"GPR"), "RFLAGS":(64,"GPR"), "EIP":(32,"GPR"), "RIP":(64,"GPR"),
|
||||
#FPU Registers
|
||||
"ST0":(80,"FPU"),"ST1":(80,"FPU"),"ST2":(80,"FPU"),"ST3":(80,"FPU"),"ST4":(80,"FPU"),"ST5":(80,"FPU"),"ST6":(80,"FPU"),"ST7":(80,"FPU"),
|
||||
#MMX Registers
|
||||
"MM0":(64,"MMX"),"MM1":(64,"MMX"),"MM2":(64,"MMX"),"MM3":(64,"MMX"),"MM4":(64,"MMX"),"MM5":(64,"MMX"),"MM6":(64,"MMX"),"MM7":(64,"MMX"),
|
||||
#XMM Registers
|
||||
"XMM0":(128,"XMM"),"XMM1":(128,"XMM"),"XMM2":(128,"XMM"),"XMM3":(128,"XMM"),"XMM4":(128,"XMM"),"XMM5":(128,"XMM"),"XMM6":(128,"XMM"),"XMM7":(128,"XMM"), "XMM8":(128,"XMM"), "XMM9":(128,"XMM"), "XMM10":(128,"XMM"), "XMM11":(128,"XMM"), "XMM12":(128,"XMM"), "XMM13":(128,"XMM"), "XMM14":(128,"XMM"), "XMM15":(128,"XMM"), "XMM16":(128,"XMM"), "XMM17":(128,"XMM"), "XMM18":(128,"XMM"), "XMM19":(128,"XMM"), "XMM20":(128,"XMM"), "XMM21":(128,"XMM"), "XMM22":(128,"XMM"), "XMM23":(128,"XMM"), "XMM24":(128,"XMM"), "XMM25":(128,"XMM"), "XMM26":(128,"XMM"), "XMM27":(128,"XMM"), "XMM28":(128,"XMM"), "XMM29":(128,"XMM"), "XMM30":(128,"XMM"), "XMM31":(128,"XMM"),
|
||||
#YMM Registers
|
||||
"YMM0":(256,"YMM"),"YMM1":(256,"YMM"),"YMM2":(256,"YMM"),"YMM3":(256,"YMM"),"YMM4":(256,"YMM"),"YMM5":(256,"YMM"),"YMM6":(256,"YMM"),"YMM7":(256,"YMM"), "YMM8":(256,"YMM"), "YMM9":(256,"YMM"), "YMM10":(256,"YMM"), "YMM11":(256,"YMM"), "YMM12":(256,"YMM"), "YMM13":(256,"YMM"), "YMM14":(256,"YMM"), "YMM15":(256,"YMM"), "YMM16":(256,"YMM"), "YMM17":(256,"YMM"), "YMM18":(256,"YMM"), "YMM19":(256,"YMM"), "YMM20":(256,"YMM"), "YMM21":(256,"YMM"), "YMM22":(256,"YMM"), "YMM23":(256,"YMM"), "YMM24":(256,"YMM"), "YMM25":(256,"YMM"), "YMM26":(256,"YMM"), "YMM27":(256,"YMM"), "YMM28":(256,"YMM"), "YMM29":(256,"YMM"), "YMM30":(256,"YMM"), "YMM31":(256,"YMM"),
|
||||
#ZMM Registers
|
||||
"ZMM0":(512,"ZMM"),"ZMM1":(512,"ZMM"),"ZMM2":(512,"ZMM"),"ZMM3":(512,"ZMM"),"ZMM4":(512,"ZMM"),"ZMM5":(512,"ZMM"),"ZMM6":(512,"ZMM"),"ZMM7":(512,"ZMM"), "ZMM8":(512,"ZMM"), "ZMM9":(512,"ZMM"), "ZMM10":(512,"ZMM"), "ZMM11":(512,"ZMM"), "ZMM12":(512,"ZMM"), "ZMM13":(512,"ZMM"), "ZMM14":(512,"ZMM"), "ZMM15":(512,"ZMM"), "ZMM16":(512,"ZMM"), "ZMM17":(512,"ZMM"), "ZMM18":(512,"ZMM"), "ZMM19":(512,"ZMM"), "ZMM20":(512,"ZMM"), "ZMM21":(512,"ZMM"), "ZMM22":(512,"ZMM"), "ZMM23":(512,"ZMM"), "ZMM24":(512,"ZMM"), "ZMM25":(512,"ZMM"), "ZMM26":(512,"ZMM"), "ZMM27":(512,"ZMM"), "ZMM28":(512,"ZMM"), "ZMM29":(512,"ZMM"), "ZMM30":(512,"ZMM"), "ZMM31":(512,"ZMM"),
|
||||
#Opmask Register
|
||||
"K0":(64,"K"), "K1":(64,"K"), "K2":(64,"K"), "K3":(64,"K"), "K4":(64,"K"), "K5":(64,"K"), "K6":(64,"K"), "K7":(64,"K"),
|
||||
#Bounds Registers
|
||||
"BND0":(128,"BND"),"BND1":(128,"BND"),"BND2":(128,"BND"),"BND3":(128,"BND")
|
||||
}
|
||||
|
||||
def __init__(self,name,mask=False):
    """Create a register operand from its assembly name.

    Parameters:
        name: register name in any letter case (e.g. "ymm0"); it is stored
            upper-cased because the ``sizes`` table uses upper-case keys.
        mask: True when the operand carries an opmask suffix.

    Raises:
        NameError: if the upper-cased name has no entry in ``sizes``.
    """
    self.name = name.upper()
    self.mask = mask
    # The former test ``if[name in self.sizes]`` wrapped the result in a
    # list, which is always truthy, so unknown names were never rejected
    # here. Test the normalised name directly instead.
    if self.name not in self.sizes:
        raise NameError("Register name not in dictionary: "+self.name)
    entry = self.sizes[self.name]
    # Each table entry is a (bit width, register class) pair.
    self.size = entry[0]
    self.reg_type = entry[1]
|
||||
|
||||
def print(self):
    """Return the operand descriptor of this register.

    The descriptor is the register class followed by the bit width, with
    "{opmask}" appended when the operand was created with a mask.
    """
    suffix = "{opmask}" if self.mask else ""
    return self.reg_type + str(self.size) + suffix
|
||||
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Construct one register of three different classes.
    # (The load_db/create_testcase/save_db calls are currently disabled.)
    r0 = Register("ymm0")
    r1 = Register("xmm0")
    r2 = Register("rax")
    # NOTE(review): nesting reconstructed from an unindented listing -
    # confirm that print_sorted_db() belongs inside the argument check.
    cli_args = sys.argv[1:]
    if cli_args:
        # Hand every command-line argument to extract_instr, then call
        # print_sorted_db once.
        for cli_arg in cli_args:
            extract_instr(cli_arg)
        print_sorted_db()
|
||||
432
osaca.py
Executable file
432
osaca.py
Executable file
@@ -0,0 +1,432 @@
|
||||
#!/apps/python/3.5-anaconda/bin/python
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import subprocess
|
||||
import os
|
||||
import re
|
||||
import Params
|
||||
import pandas as pd
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
#----------Global variables--------------
# Command-line input: target architecture and path of the analysed file.
arch = ''
archList = ['SNB', 'IVB', 'HSW', 'BDW', 'SKL']
filepath = ''
# File content; get_file/create_elffile replace it with a list of lines.
srcCode = ''
# Marker searched for by check_line and pattern for an address prefix
# ("<whitespace><hex digits>:").
marker = r'//STARTLOOP'
asm_line = re.compile(r'\s[0-9a-f]+[:]')
# State used by check_line while scanning the listing.
numSeps = 0
sem = 0
firstAppearance = True
lncnt = 0
# One entry per instruction: [mnemonic, operands..., instruction text].
instrForms = []
# Throughput table (replaced by a pandas DataFrame in read_csv) and report.
df = ''
output = ''
horizontalSeparator = ''
total_tp = 0
longestInstr = 30
# Reference values filled by create_sequences for validate_TP.
cycList = []
reciList = []
#---------------------------------------
|
||||
|
||||
# Check if the architecture arg is valid
|
||||
def check_arch():
    """Return True when the global ``arch`` is listed in ``archList``."""
    return arch in archList
|
||||
|
||||
# Check if the given filepath exists and if the format is the needed elf64
|
||||
def check_elffile():
    """Load ``filepath`` through objdump and check that it is an elf64 file.

    Returns:
        True when ``filepath`` is an existing file and the second line of
        the objdump output contains 'file format elf64', otherwise False.

    Side effect: ``create_elffile`` stores the objdump output in the global
    ``srcCode`` whenever the file exists.
    """
    if not os.path.isfile(filepath):
        return False
    create_elffile()
    # The format banner is expected on the second output line. If objdump
    # produced fewer lines (e.g. it failed), report "wrong format" instead
    # of raising IndexError.
    return len(srcCode) > 1 and 'file format elf64' in srcCode[1]
|
||||
|
||||
# Check if the given filepath exists
|
||||
def check_file():
    """Load ``filepath`` via get_file and return True; return False when it is no file."""
    if not os.path.isfile(filepath):
        return False
    get_file()
    return True
|
||||
|
||||
# Load binary file in variable srcCode and separate by line
|
||||
def create_elffile():
    """Run ``objdump --source`` on ``filepath`` and store its output lines in ``srcCode``."""
    global srcCode
    objdump = subprocess.run(['objdump', '--source', filepath], stdout=subprocess.PIPE)
    listing = objdump.stdout.decode('utf-8')
    srcCode = listing.split('\n')
|
||||
|
||||
# Load arbitrary file in variable srcCode and separate by line
|
||||
def get_file():
    """Read ``filepath`` as text and store its lines in the global ``srcCode``.

    Raises:
        IOError: if the file cannot be opened or read. The message is
            printed first and the error is then re-raised; previously the
            function carried on and failed on the unbound file handle.
    """
    global srcCode
    try:
        # The context manager closes the handle even when reading fails.
        with open(filepath, 'r') as f:
            content = f.read()
    except IOError:
        print('IOError: file \'{}\' not found'.format(filepath))
        raise
    srcCode = content.split('\n')
|
||||
|
||||
|
||||
def check_line(line):
    """Process one line of the listing and forward marked assembly to check_instr.

    A line containing ``marker`` records the marker's indentation in
    ``numSeps`` and sets ``sem`` to 2. While ``sem`` is positive, lines that
    match ``asm_line`` and contain no '//' are passed to ``check_instr``;
    any other line indented no deeper than the marker decrements ``sem``.
    """
    global numSeps
    global sem
    global firstAppearance
    if marker in line:
        # On the first marker, find out whether the code is indented with
        # spaces or tabs.
        if firstAppearance:
            set_char_counter(line)
            firstAppearance = False
        # Count the indentation characters in front of the marker.
        before_marker = re.split(marker, line)[0]
        numSeps = before_marker.count(cntChar)
        sem = 2
        return
    if sem <= 0:
        # Outside of the marked code snippet.
        return
    if re.search(asm_line, line):
        # Assembly line: skip it when it carries a '//' comment, otherwise
        # analyse the text after the last tab.
        if r'//' not in line:
            check_instr(line.split('\t')[-1])
        return
    leading = re.split(r'\S', line)[0]
    if leading.count(cntChar) <= numSeps:
        # Not in the loop anymore - or not yet. Decrement the semaphore.
        sem -= 1
|
||||
|
||||
|
||||
# Check if separators are either tabulators or whitespaces
|
||||
def set_char_counter(line):
    """Set the global ``cntChar`` to the character that indents the marker line.

    Raises:
        NotImplementedError: if the text before the marker contains both
            spaces and tabs, or neither of them.
    """
    global cntChar
    before_marker = re.split(marker, line)[0]
    has_spaces = ' ' in before_marker
    has_tabs = '\t' in before_marker
    if has_spaces and not has_tabs:
        cntChar = ' '
    elif has_tabs and not has_spaces:
        cntChar = '\t'
    else:
        raise NotImplementedError('Indentation of code is only supported for whitespaces and tabs.')
|
||||
|
||||
|
||||
def check_instr(instr):
    """Classify the operands of one instruction and append it to ``instrForms``.

    Parameters:
        instr: instruction text (mnemonic followed by its operands).

    The appended entry is ``[mnemonic, <operands in reversed order>, instr]``
    where every operand is a ``Params`` object (Parameter, Register or
    MemAddr). ``longestInstr`` grows to the length of the longest
    instruction text seen. Empty text and a lone two-digit hex byte are
    ignored.
    """
    global instrForms
    global longestInstr
    # Check for strange clang padding bytes: drop each leading 'data32'
    # together with the character that follows it.
    while instr.startswith('data32'):
        instr = instr[7:]
    tokens = instr.split()
    if not tokens:
        # Nothing left to analyse; previously this raised IndexError.
        return
    mnemonic = tokens[0]
    # Check if the line is not only a byte
    if len(mnemonic) == 2 and re.match(r'[0-9a-f]{2}', mnemonic):
        return
    params = ''.join(tokens[1:])
    # Classify each operand as IMMEDIATE (IMD), REGISTER (REG),
    # MEMORY (MEM) or LABEL (LBL); an empty operand becomes NONE.
    operands = []
    for text in flatten(separate_params(params)):
        if len(text) <= 0:
            op = Params.Parameter('NONE')
        elif text[0] == '$':
            op = Params.Parameter('IMD')
        elif text[0] == '%' and '(' not in text:
            # A '{' starts the opmask suffix; the register name ends there.
            opmask = '{' in text
            end = text.index('{') if opmask else len(text)
            op = Params.Register(text[1:end], opmask)
        elif '<' in text:
            op = Params.Parameter('LBL')
        else:
            op = Params.MemAddr(text)
        operands.append(op)
    # Add to list
    longestInstr = max(longestInstr, len(instr))
    instrForms.append([mnemonic]+operands[::-1]+[instr])
|
||||
|
||||
def separate_params(params):
    """Split an operand string into its operands.

    Parameters:
        params: operand text without whitespace, e.g. '(%rax,%rbx,8),%xmm0'.

    Returns:
        A nested list ``[first, [second, [third]]]`` (a single operand gives
        ``[operand]``), the shape ``flatten`` expects.

    Commas inside parentheses belong to a memory operand and do not split.
    Everything from the first '#' on is dropped as a comment. The previous
    implementation always cut behind the first ')' and therefore merged
    operands when the memory operand was not the first one
    (e.g. '$0x1,(%rax),%ymm0').
    """
    comment_start = params.find('#')
    if comment_start != -1:
        params = params[:comment_start]
    parts = []
    depth = 0
    start = 0
    for pos, char in enumerate(params):
        if char == '(':
            depth += 1
        elif char == ')':
            # Never drop below zero on an unbalanced ')'.
            depth = max(depth-1, 0)
        elif char == ',' and depth == 0:
            parts.append(params[start:pos])
            start = pos+1
    parts.append(params[start:])
    # Rebuild the nested shape from the last operand outwards.
    nested = [parts[-1]]
    for part in reversed(parts[:-1]):
        nested = [part, nested]
    return nested
|
||||
|
||||
def flatten(l):
    """Return the elements of an arbitrarily nested list as one flat list.

    An empty list is returned unchanged.
    """
    if l == []:
        return l
    flat = []
    for item in l:
        if isinstance(item, list):
            # Nested list: splice in its flattened content.
            flat.extend(flatten(item))
        else:
            flat.append(item)
    return flat
|
||||
|
||||
def read_csv():
    """Load ``data/<arch>_throughput.csv`` (lower-case arch) into the global ``df``."""
    global df
    csv_path = 'data/'+arch.lower()+'_throughput.csv'
    df = pd.read_csv(csv_path)
|
||||
|
||||
def create_horiz_sep():
    """Set ``horizontalSeparator`` to a dash line eight characters longer than ``longestInstr``."""
    global horizontalSeparator
    width = longestInstr+8
    horizontalSeparator = '-'*width
|
||||
|
||||
def create_output():
    """Build the throughput report for all collected instruction forms.

    Reads the globals ``instrForms``, ``df``, ``arch`` and ``filepath``,
    writes the finished text to ``output`` and adds every throughput value
    to ``total_tp``. ``longestInstr`` is capped at 70 for the layout.

    For each instruction the key ``<mnemonic>-<operands>-TP`` is looked up
    in ``df``. If it is missing, memory operands are replaced by a register
    operand of the same instruction and the lookup is repeated. Forms that
    still have no value are listed with throughput 0 and a '*' and trigger
    the closing hint. Forms without any register operand are skipped.
    """
    global total_tp
    global output
    global longestInstr
    warning = False

    # Check the output alignment depending on the longest instruction
    if longestInstr > 70:
        longestInstr = 70
    create_horiz_sep()
    ws = ' '*(len(horizontalSeparator)-23)
    # Write general information about the benchmark
    output = ''.join([
        '--'+horizontalSeparator+'\n',
        '| Analyzing of file:\t'+os.getcwd()+'/'+filepath+'\n',
        '| Architecture:\t\t'+arch+'\n',
        '| Timestamp:\t\t'+datetime.now().strftime('%Y-%m-%d %H:%M:%S')+'\n',
        '|\n| INSTRUCTION'+ws+'CLOCK CYCLES\n',
        '| '+horizontalSeparator+'\n|\n',
    ])
    # Check for the throughput data in CSV
    for elem in instrForms:
        # elem is [mnemonic, operand objects..., instruction text]. The
        # explicit operand descriptors are part of the key, so no separate
        # avx/avx512 extension is needed.
        opExt = []
        opExtRegs = []
        for op in elem[1:-1]:
            is_reg = isinstance(op, Params.Register)
            # Record register operands from the operand objects themselves;
            # the descriptors (e.g. 'xmm128') are no register names and
            # cannot be fed back into the Register constructor.
            opExtRegs.append(is_reg)
            if is_reg and op.reg_type == 'GPR':
                opExt.append('r'+str(op.size))
            else:
                opExt.append(op.print().lower())
        key = elem[0]+'-'+'_'.join(opExt)+'-TP'
        notFound = False
        # Keys contain parentheses ('mem(base)'), so match them literally.
        if True in df['instr'].str.contains(key, regex=False).values:
            # It's a match!
            try:
                tp = df[df.instr == key].clock_cycles.values[0]
            except IndexError:
                # Only a longer key contains this one - no exact entry.
                print('Error while fetching data from database')
                continue
        else:
            # Did not find the exact instruction form.
            # Try to find the instruction form for register operands only.
            if True not in opExtRegs:
                # No register in the whole instruction form, so the needed
                # register size is unknown.
                print('Feature not included yet')
                warning = True
                continue
            if not opExtRegs[0]:
                # Instruction stores result in memory. Check for storing
                # in register instead.
                if len(opExt) > 1:
                    if opExtRegs[1]:
                        opExt[0] = opExt[1]
                    elif len(opExt) > 2:
                        if opExtRegs[2]:
                            opExt[0] = opExt[2]
            if len(opExtRegs) == 2 and not opExtRegs[1]:
                # Instruction loads value from memory and has only two
                # operands. Check for loading from register instead.
                if opExtRegs[0]:
                    opExt[1] = opExt[0]
            if len(opExtRegs) == 3 and not opExtRegs[2]:
                # Instruction loads value from memory and has three
                # operands. Check for loading from register instead.
                opExt[2] = opExt[0]
            key = elem[0]+'-'+'_'.join(opExt)+'-TP'
            # Check for register equivalent instruction
            if True in df['instr'].str.contains(key, regex=False).values:
                # It's a match!
                try:
                    tp = df[df.instr == key].clock_cycles.values[0]
                except IndexError:
                    print('Error while fetching data from database')
                    continue
            else:
                # Did not find the register instruction form either. Set
                # the warning and go on with throughput 0.
                tp = 0
                notFound = True
                warning = True
        # Add it to the overall throughput
        total_tp += tp
        # Check the alignment again
        numWhitespaces = longestInstr-len(elem[-1])
        ws = ' '*numWhitespaces+'| '
        n_f = ''
        if notFound:
            n_f = ' '*(5-len(str(tp)))+'*'
        output += '| '+elem[-1]+ws+str(tp)+n_f+'\n'
    # Finally write the total throughput
    numWhitespaces = longestInstr-27
    ws = ' '+' '*numWhitespaces
    output += ('| '+horizontalSeparator+'\n'
               '| TOTAL ESTIMATED THROUGHPUT:'+ws+str(total_tp))
    if warning:
        output += ('\n\n* There was no throughput value found '
                   'for the specific instruction form.'
                   '\n Please create a testcase via the create_testcase-method '
                   'or add a value manually.')
|
||||
|
||||
def create_sequences():
    """Append the integers 1..100 to ``cycList`` and their reciprocals to ``reciList``."""
    global cycList
    global reciList

    numbers = range(1, 101)
    cycList.extend(numbers)
    reciList.extend(1/number for number in numbers)
|
||||
|
||||
def validate_TP(clkC, instr):
    """Round a measured cycle count to a nearby integer or integer reciprocal.

    Parameters:
        clkC: measured clock cycles (anything ``float`` accepts).
        instr: instruction name, used only in the warning message.

    Returns:
        The first of the first 100 entries of ``cycList``/``reciList`` that
        lies within 5 % of the measurement (the integer is tried before its
        reciprocal). Otherwise a warning is printed and ``clkC`` is
        returned unchanged.
    """
    measured = float(clkC)
    for idx in range(100):
        for candidate in (cycList[idx], reciList[idx]):
            if candidate*0.95 < measured < candidate*1.05:
                # Value is probably correct, so round it to the estimate.
                return candidate
    # No value close to an integer or its reciprocal was found; the
    # measurement is assumed to be incorrect.
    print('Your measurement for {} is probably wrong. Please inspect your benchmark!'.format(instr))
    print('The program will continue with the given value')
    return clkC
|
||||
|
||||
def write_csv(csv):
    """Write ``csv`` to ``data/<arch>_throughput.csv``, replacing the file.

    Parameters:
        csv: complete CSV text to store.

    Raises:
        IOError: if the file cannot be opened or written. The message is
            printed first and the error is then re-raised; previously the
            function carried on and failed on the unbound file handle.
    """
    try:
        # The context manager closes the handle even when writing fails.
        with open('data/'+arch.lower()+'_throughput.csv', 'w') as f:
            f.write(csv)
    except IOError:
        print('IOError: file \'{}\' not found in ./data'.format(arch.lower()+'_throughput.csv'))
        raise
|
||||
|
||||
##---------------main functions depending on arguments----------------------
|
||||
|
||||
#reads ibench output and includes it in the architecture specific csv file
|
||||
def include_ibench():
    """Merge the throughput values of an ibench output file into the CSV database.

    Every line of the input that contains 'TP' is read as
    ``<instruction>: <clock cycles>``. Instructions missing from the
    database are added. For known instructions a differing value is only
    reported and the stored value is kept. The merged table is written
    back through ``write_csv``.

    Exits with status 1 when the architecture or the file path is invalid.
    """
    global df

    # Check args and exit program if something's wrong
    if not check_arch():
        print('Invalid microarchitecture.')
        sys.exit(1)
    if not check_file():
        print('Invalid file path or file format.')
        sys.exit(1)
    # Check for database for the chosen architecture
    read_csv()
    # Create sequence of numbers and their reciprocals to validate the
    # measurements
    create_sequences()

    print('Everything seems fine! Let\'s start checking!')
    newData = []
    for line in srcCode:
        if 'TP' not in line:
            continue
        fields = line.split()
        if len(fields) < 2:
            # No measurement on this line - nothing to import.
            continue
        # Instruction name without its trailing character, then the
        # measured number of clock cycles.
        instr = fields[0][:-1]
        clkC = validate_TP(fields[1], instr)
        known = df.loc[df.instr == instr, 'clock_cycles'].values
        if len(known) == 0:
            # Instruction not in database yet --> add it
            newData.append([instr, clkC])
        elif known[0] != clkC:
            print('Different measurement for {}: {}(old) vs. {}(new)\nPlease check for correctness (no changes were made).'.format(instr, known[0], clkC))
    # Now merge the DataFrames and write new csv file.
    # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    df = pd.concat([df, pd.DataFrame(newData, columns=['instr','clock_cycles'])], ignore_index=True)
    csv = df.to_csv(index=False)
    write_csv(csv)
    print('ibench output {} successfully in database included.'.format(filepath.split('/')[-1]))
|
||||
|
||||
|
||||
# main function of the tool
|
||||
def inspect_binary():
    """Analyse the marked code of the binary at ``filepath`` and print the report.

    Validates the architecture and the file, loads the throughput table,
    feeds every line of the objdump listing to ``check_line`` (counting
    lines in ``lncnt``) and prints the text built by ``create_output``.

    Exits with status 1 when the architecture or the file is invalid.
    """
    # Without this declaration the increment below rebinds a local name
    # and raises UnboundLocalError.
    global lncnt
    # Check args and exit program if something's wrong
    if not check_arch():
        print('Invalid microarchitecture.')
        sys.exit(1)
    if not check_elffile():
        print('Invalid file path or file format.')
        sys.exit(1)
    # Finally check for database for the chosen architecture
    read_csv()

    print('Everything seems fine! Let\'s start checking!')
    for line in srcCode:
        lncnt += 1
        check_line(line)
    create_output()
    print(output)
|
||||
|
||||
##------------------------------------------------------------------------------
|
||||
##------------Main method--------------
|
||||
def main():
    """Parse the command line and run the requested mode.

    ``--arch`` and the file path are stored in the globals ``arch``
    (upper-cased) and ``filepath``. With ``--include-ibench`` the file is
    merged into the throughput database, otherwise it is analysed as a
    binary.
    """
    global lncnt
    global inp
    global arch
    global filepath
    # Parse args
    parser = argparse.ArgumentParser(description='Analyzes a marked innermost loop snippet for a given architecture type and prints out the estimated average throughput')
    parser.add_argument('--version', '-V', action='version', version='%(prog)s 0.1')
    # The architecture is needed in both modes; making it required gives a
    # usage error instead of an AttributeError on None when it is missing.
    parser.add_argument('--arch', dest='arch', type=str, required=True, help='define architecture')
    parser.add_argument('filepath', type=str, help='path to object (Binary, CSV)')
    parser.add_argument('--include-ibench', '-i', dest='incl', action='store_true', help='includes the given values in form of the output of ibench in the database')

    # Store args in global variables
    inp = parser.parse_args()
    arch = inp.arch.upper()
    filepath = inp.filepath

    if inp.incl:
        include_ibench()
    else:
        inspect_binary()


##------------Main method--------------
if __name__ == '__main__':
    main()
|
||||
100
testcases/add-rr-TP.S
Normal file
100
testcases/add-rr-TP.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR add
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
# Create DP 2.0
|
||||
add rbx, rax
|
||||
# Create DP 0.5
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR edx, eax
|
||||
INSTR r9d, ebx
|
||||
INSTR r10d, ecx
|
||||
INSTR edx, eax
|
||||
INSTR r9d, ebx
|
||||
INSTR r10d, ecx
|
||||
INSTR r11d, eax
|
||||
INSTR r12d, ebx
|
||||
INSTR r13d, ecx
|
||||
INSTR r14d, eax
|
||||
INSTR r15d, ebx
|
||||
INSTR eax, ecx
|
||||
INSTR ebx, eax
|
||||
INSTR ecx, ebx
|
||||
INSTR edx, ecx
|
||||
INSTR r9d, eax
|
||||
INSTR r10d, ebx
|
||||
INSTR r11d, ecx
|
||||
INSTR r12d, eax
|
||||
INSTR r13d, ebx
|
||||
INSTR r14d, ecx
|
||||
INSTR r15d, eax
|
||||
INSTR eax, ebx
|
||||
INSTR ebx, ecx
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
100
testcases/add-rr.S
Normal file
100
testcases/add-rr.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR add
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
# Create DP 2.0
|
||||
add rbx, rax
|
||||
# Create DP 0.5
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR eax, ebx
|
||||
INSTR ebx, eax
|
||||
INSTR eax, ebx
|
||||
INSTR ebx, eax
|
||||
INSTR eax, ebx
|
||||
INSTR ebx, eax
|
||||
INSTR eax, ebx
|
||||
INSTR ebx, eax
|
||||
INSTR eax, ebx
|
||||
INSTR ebx, eax
|
||||
INSTR eax, ebx
|
||||
INSTR ebx, eax
|
||||
INSTR eax, ebx
|
||||
INSTR ebx, eax
|
||||
INSTR eax, ebx
|
||||
INSTR ebx, eax
|
||||
INSTR eax, ebx
|
||||
INSTR ebx, eax
|
||||
INSTR eax, ebx
|
||||
INSTR ebx, eax
|
||||
INSTR eax, ebx
|
||||
INSTR ebx, eax
|
||||
INSTR eax, ebx
|
||||
INSTR ebx, eax
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
100
testcases/cmp-rr-TP.S
Normal file
100
testcases/cmp-rr-TP.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR cmp
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
# Create DP 2.0
|
||||
add rbx, rax
|
||||
# Create DP 0.5
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR rdx, rax
|
||||
INSTR r9, rbx
|
||||
INSTR r10, rcx
|
||||
INSTR rdx, rax
|
||||
INSTR r9, rbx
|
||||
INSTR r10, rcx
|
||||
INSTR r11, rax
|
||||
INSTR r12, rbx
|
||||
INSTR r13, rcx
|
||||
INSTR r14, rax
|
||||
INSTR r15, rbx
|
||||
INSTR rax, rcx
|
||||
INSTR rbx, rax
|
||||
INSTR rcx, rbx
|
||||
INSTR rdx, rcx
|
||||
INSTR r9, rax
|
||||
INSTR r10, rbx
|
||||
INSTR r11, rcx
|
||||
INSTR r12, rax
|
||||
INSTR r13, rbx
|
||||
INSTR r14, rcx
|
||||
INSTR r15, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rcx
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
100
testcases/cmp-rr.S
Normal file
100
testcases/cmp-rr.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR cmp
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
# Create DP 2.0
|
||||
add rbx, rax
|
||||
# Create DP 0.5
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
100
testcases/dec-r-TP.S
Normal file
100
testcases/dec-r-TP.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR dec
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
# Create DP 2.0
|
||||
add rbx, rax
|
||||
# Create DP 0.5
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR edx
|
||||
INSTR r9d
|
||||
INSTR r10d
|
||||
INSTR edx
|
||||
INSTR r9d
|
||||
INSTR r10d
|
||||
INSTR r11d
|
||||
INSTR r12d
|
||||
INSTR r13d
|
||||
INSTR r14d
|
||||
INSTR r15d
|
||||
INSTR eax
|
||||
INSTR ebx
|
||||
INSTR ecx
|
||||
INSTR edx
|
||||
INSTR r9d
|
||||
INSTR r10d
|
||||
INSTR r11d
|
||||
INSTR r12d
|
||||
INSTR r13d
|
||||
INSTR r14d
|
||||
INSTR r15d
|
||||
INSTR eax
|
||||
INSTR ebx
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
100
testcases/dec-r.S
Normal file
100
testcases/dec-r.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR dec
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
# Create DP 2.0
|
||||
add rbx, rax
|
||||
# Create DP 0.5
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
100
testcases/inc-r-TP.S
Normal file
100
testcases/inc-r-TP.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR inc
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 11111110..0 (54=64-(10-1))
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: 1 bit for sign; leading mantissa bit is zero
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
# Create DP 2.0
|
||||
add rbx, rax
|
||||
# Create DP 0.5
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR rdx
|
||||
INSTR r9
|
||||
INSTR r10
|
||||
INSTR rdx
|
||||
INSTR r9
|
||||
INSTR r10
|
||||
INSTR r11
|
||||
INSTR r12
|
||||
INSTR r13
|
||||
INSTR r14
|
||||
INSTR r15
|
||||
INSTR rax
|
||||
INSTR rbx
|
||||
INSTR rcx
|
||||
INSTR rdx
|
||||
INSTR r9
|
||||
INSTR r10
|
||||
INSTR r11
|
||||
INSTR r12
|
||||
INSTR r13
|
||||
INSTR r14
|
||||
INSTR r15
|
||||
INSTR rax
|
||||
INSTR rbx
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
100
testcases/inc-r.S
Normal file
100
testcases/inc-r.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR inc
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones (every word compares equal to itself)
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the exponent MSB -> exponent 0x3FF, mantissa zero
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
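# Integer add of the two 1.0 bit patterns: rbx = 0x7FE0000000000000 (NOTE: not the encoding of DP 2.0, which is 0x4000000000000000)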
|
||||
add rbx, rax
|
||||
# Unsigned integer divide of rdx:rax by rax: rax = 1, rdx = 0 (NOTE: this is integer 1, not DP 0.5)
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
82
testcases/janadd-r64r32-TP.S
Normal file
82
testcases/janadd-r64r32-TP.S
Normal file
@@ -0,0 +1,82 @@
|
||||
#define INSTR janadd
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones (every word compares equal to itself)
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the exponent MSB -> exponent 0x3FF, mantissa zero
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
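# Integer add of the two 1.0 bit patterns: rbx = 0x7FE0000000000000 (NOTE: not the encoding of DP 2.0, which is 0x4000000000000000)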
|
||||
add rbx, rax
|
||||
# Unsigned integer divide of rdx:rax by rax: rax = 1, rdx = 0 (NOTE: this is integer 1, not DP 0.5)
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR rdx, eax
|
||||
INSTR r9, ebx
|
||||
INSTR r10, ecx
|
||||
INSTR rdx, eax
|
||||
INSTR r9, ebx
|
||||
INSTR r10, ecx
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
82
testcases/janadd-r64r32.S
Normal file
82
testcases/janadd-r64r32.S
Normal file
@@ -0,0 +1,82 @@
|
||||
#define INSTR janadd
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones (every word compares equal to itself)
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the exponent MSB -> exponent 0x3FF, mantissa zero
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
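# Integer add of the two 1.0 bit patterns: rbx = 0x7FE0000000000000 (NOTE: not the encoding of DP 2.0, which is 0x4000000000000000)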
|
||||
add rbx, rax
|
||||
# Unsigned integer divide of rdx:rax by rax: rax = 1, rdx = 0 (NOTE: this is integer 1, not DP 0.5)
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
82
testcases/janadd-rr-TP.S
Normal file
82
testcases/janadd-rr-TP.S
Normal file
@@ -0,0 +1,82 @@
|
||||
#define INSTR janadd
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones (every word compares equal to itself)
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the exponent MSB -> exponent 0x3FF, mantissa zero
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
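# Integer add of the two 1.0 bit patterns: rbx = 0x7FE0000000000000 (NOTE: not the encoding of DP 2.0, which is 0x4000000000000000)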
|
||||
add rbx, rax
|
||||
# Unsigned integer divide of rdx:rax by rax: rax = 1, rdx = 0 (NOTE: this is integer 1, not DP 0.5)
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR rdx, eax
|
||||
INSTR r9, ebx
|
||||
INSTR r10, ecx
|
||||
INSTR rdx, eax
|
||||
INSTR r9, ebx
|
||||
INSTR r10, ecx
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
82
testcases/janadd-rr.S
Normal file
82
testcases/janadd-rr.S
Normal file
@@ -0,0 +1,82 @@
|
||||
#define INSTR janadd
|
||||
#define NINST 6
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones (every word compares equal to itself)
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the exponent MSB -> exponent 0x3FF, mantissa zero
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
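# Integer add of the two 1.0 bit patterns: rbx = 0x7FE0000000000000 (NOTE: not the encoding of DP 2.0, which is 0x4000000000000000)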
|
||||
add rbx, rax
|
||||
# Unsigned integer divide of rdx:rax by rax: rax = 1, rdx = 0 (NOTE: this is integer 1, not DP 0.5)
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
100
testcases/mov-rr-TP.S
Normal file
100
testcases/mov-rr-TP.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR mov
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones (every word compares equal to itself)
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the exponent MSB -> exponent 0x3FF, mantissa zero
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
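# Integer add of the two 1.0 bit patterns: rbx = 0x7FE0000000000000 (NOTE: not the encoding of DP 2.0, which is 0x4000000000000000)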
|
||||
add rbx, rax
|
||||
# Unsigned integer divide of rdx:rax by rax: rax = 1, rdx = 0 (NOTE: this is integer 1, not DP 0.5)
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR rdx, rax
|
||||
INSTR r9, rbx
|
||||
INSTR r10, rcx
|
||||
INSTR rdx, rax
|
||||
INSTR r9, rbx
|
||||
INSTR r10, rcx
|
||||
INSTR r11, rax
|
||||
INSTR r12, rbx
|
||||
INSTR r13, rcx
|
||||
INSTR r14, rax
|
||||
INSTR r15, rbx
|
||||
INSTR rax, rcx
|
||||
INSTR rbx, rax
|
||||
INSTR rcx, rbx
|
||||
INSTR rdx, rcx
|
||||
INSTR r9, rax
|
||||
INSTR r10, rbx
|
||||
INSTR r11, rcx
|
||||
INSTR r12, rax
|
||||
INSTR r13, rbx
|
||||
INSTR r14, rcx
|
||||
INSTR r15, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rcx
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
100
testcases/mov-rr.S
Normal file
100
testcases/mov-rr.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR mov
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones (every word compares equal to itself)
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the exponent MSB -> exponent 0x3FF, mantissa zero
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
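# Integer add of the two 1.0 bit patterns: rbx = 0x7FE0000000000000 (NOTE: not the encoding of DP 2.0, which is 0x4000000000000000)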
|
||||
add rbx, rax
|
||||
# Unsigned integer divide of rdx:rax by rax: rax = 1, rdx = 0 (NOTE: this is integer 1, not DP 0.5)
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
100
testcases/movslq-rr-TP.S
Normal file
100
testcases/movslq-rr-TP.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR movslq
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones (every word compares equal to itself)
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the exponent MSB -> exponent 0x3FF, mantissa zero
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
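# Integer add of the two 1.0 bit patterns: rbx = 0x7FE0000000000000 (NOTE: not the encoding of DP 2.0, which is 0x4000000000000000)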
|
||||
add rbx, rax
|
||||
# Unsigned integer divide of rdx:rax by rax: rax = 1, rdx = 0 (NOTE: this is integer 1, not DP 0.5)
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR rdx, eax
|
||||
INSTR r9, ebx
|
||||
INSTR r10, ecx
|
||||
INSTR rdx, eax
|
||||
INSTR r9, ebx
|
||||
INSTR r10, ecx
|
||||
INSTR r11, eax
|
||||
INSTR r12, ebx
|
||||
INSTR r13, ecx
|
||||
INSTR r14, eax
|
||||
INSTR r15, ebx
|
||||
INSTR rax, ecx
|
||||
INSTR rbx, eax
|
||||
INSTR rcx, ebx
|
||||
INSTR rdx, ecx
|
||||
INSTR r9, eax
|
||||
INSTR r10, ebx
|
||||
INSTR r11, ecx
|
||||
INSTR r12, eax
|
||||
INSTR r13, ebx
|
||||
INSTR r14, ecx
|
||||
INSTR r15, eax
|
||||
INSTR rax, ebx
|
||||
INSTR rbx, ecx
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
100
testcases/movslq-rr.S
Normal file
100
testcases/movslq-rr.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR movslq
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones (every word compares equal to itself)
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the exponent MSB -> exponent 0x3FF, mantissa zero
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
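# Integer add of the two 1.0 bit patterns: rbx = 0x7FE0000000000000 (NOTE: not the encoding of DP 2.0, which is 0x4000000000000000)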
|
||||
add rbx, rax
|
||||
# Unsigned integer divide of rdx:rax by rax: rax = 1, rdx = 0 (NOTE: this is integer 1, not DP 0.5)
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
INSTR rax, eax
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
100
testcases/movzbl-rr-TP.S
Normal file
100
testcases/movzbl-rr-TP.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR movzbl
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones (every word compares equal to itself)
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the exponent MSB -> exponent 0x3FF, mantissa zero
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
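# Integer add of the two 1.0 bit patterns: rbx = 0x7FE0000000000000 (NOTE: not the encoding of DP 2.0, which is 0x4000000000000000)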
|
||||
add rbx, rax
|
||||
# Unsigned integer divide of rdx:rax by rax: rax = 1, rdx = 0 (NOTE: this is integer 1, not DP 0.5)
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR edx, al
|
||||
INSTR r9d, bl
|
||||
INSTR r10d, cl
|
||||
INSTR edx, al
|
||||
INSTR r9d, bl
|
||||
INSTR r10d, cl
|
||||
INSTR r11d, al
|
||||
INSTR r12d, bl
|
||||
INSTR r13d, cl
|
||||
INSTR r14d, al
|
||||
INSTR r15d, bl
|
||||
INSTR eax, cl
|
||||
INSTR ebx, al
|
||||
INSTR ecx, bl
|
||||
INSTR edx, cl
|
||||
INSTR r9d, al
|
||||
INSTR r10d, bl
|
||||
INSTR r11d, cl
|
||||
INSTR r12d, al
|
||||
INSTR r13d, bl
|
||||
INSTR r14d, cl
|
||||
INSTR r15d, al
|
||||
INSTR eax, bl
|
||||
INSTR ebx, cl
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
100
testcases/movzbl-rr.S
Normal file
100
testcases/movzbl-rr.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR movzbl
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones (every word compares equal to itself)
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the exponent MSB -> exponent 0x3FF, mantissa zero
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
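# Integer add of the two 1.0 bit patterns: rbx = 0x7FE0000000000000 (NOTE: not the encoding of DP 2.0, which is 0x4000000000000000)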
|
||||
add rbx, rax
|
||||
# Unsigned integer divide of rdx:rax by rax: rax = 1, rdx = 0 (NOTE: this is integer 1, not DP 0.5)
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR eax, al
|
||||
INSTR eax, al
|
||||
INSTR eax, al
|
||||
INSTR eax, al
|
||||
INSTR eax, al
|
||||
INSTR eax, al
|
||||
INSTR eax, al
|
||||
INSTR eax, al
|
||||
INSTR eax, al
|
||||
INSTR eax, al
|
||||
INSTR eax, al
|
||||
INSTR eax, al
|
||||
INSTR eax, al
|
||||
INSTR eax, al
|
||||
INSTR eax, al
|
||||
INSTR eax, al
|
||||
INSTR eax, al
|
||||
INSTR eax, al
|
||||
INSTR eax, al
|
||||
INSTR eax, al
|
||||
INSTR eax, al
|
||||
INSTR eax, al
|
||||
INSTR eax, al
|
||||
INSTR eax, al
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
100
testcases/neg-r-TP.S
Normal file
100
testcases/neg-r-TP.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR neg
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones (every word compares equal to itself)
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the exponent MSB -> exponent 0x3FF, mantissa zero
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
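# Integer add of the two 1.0 bit patterns: rbx = 0x7FE0000000000000 (NOTE: not the encoding of DP 2.0, which is 0x4000000000000000)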
|
||||
add rbx, rax
|
||||
# Unsigned integer divide of rdx:rax by rax: rax = 1, rdx = 0 (NOTE: this is integer 1, not DP 0.5)
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR edx
|
||||
INSTR r9d
|
||||
INSTR r10d
|
||||
INSTR edx
|
||||
INSTR r9d
|
||||
INSTR r10d
|
||||
INSTR r11d
|
||||
INSTR r12d
|
||||
INSTR r13d
|
||||
INSTR r14d
|
||||
INSTR r15d
|
||||
INSTR eax
|
||||
INSTR ebx
|
||||
INSTR ecx
|
||||
INSTR edx
|
||||
INSTR r9d
|
||||
INSTR r10d
|
||||
INSTR r11d
|
||||
INSTR r12d
|
||||
INSTR r13d
|
||||
INSTR r14d
|
||||
INSTR r15d
|
||||
INSTR eax
|
||||
INSTR ebx
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
100
testcases/neg-r.S
Normal file
100
testcases/neg-r.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR neg
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones (every word compares equal to itself)
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the exponent MSB -> exponent 0x3FF, mantissa zero
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
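# Integer add of the two 1.0 bit patterns: rbx = 0x7FE0000000000000 (NOTE: not the encoding of DP 2.0, which is 0x4000000000000000)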
|
||||
add rbx, rax
|
||||
# Unsigned integer divide of rdx:rax by rax: rax = 1, rdx = 0 (NOTE: this is integer 1, not DP 0.5)
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
INSTR eax
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
100
testcases/pop-r-TP.S
Normal file
100
testcases/pop-r-TP.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR pop
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones (every word compares equal to itself)
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the exponent MSB -> exponent 0x3FF, mantissa zero
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
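# Integer add of the two 1.0 bit patterns: rbx = 0x7FE0000000000000 (NOTE: not the encoding of DP 2.0, which is 0x4000000000000000)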
|
||||
add rbx, rax
|
||||
# Unsigned integer divide of rdx:rax by rax: rax = 1, rdx = 0 (NOTE: this is integer 1, not DP 0.5)
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR rdx
|
||||
INSTR r9
|
||||
INSTR r10
|
||||
INSTR rdx
|
||||
INSTR r9
|
||||
INSTR r10
|
||||
INSTR r11
|
||||
INSTR r12
|
||||
INSTR r13
|
||||
INSTR r14
|
||||
INSTR r15
|
||||
INSTR rax
|
||||
INSTR rbx
|
||||
INSTR rcx
|
||||
INSTR rdx
|
||||
INSTR r9
|
||||
INSTR r10
|
||||
INSTR r11
|
||||
INSTR r12
|
||||
INSTR r13
|
||||
INSTR r14
|
||||
INSTR r15
|
||||
INSTR rax
|
||||
INSTR rbx
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
100
testcases/pop-r.S
Normal file
100
testcases/pop-r.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR pop
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones (every word compares equal to itself)
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the exponent MSB -> exponent 0x3FF, mantissa zero
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
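# Integer add of the two 1.0 bit patterns: rbx = 0x7FE0000000000000 (NOTE: not the encoding of DP 2.0, which is 0x4000000000000000)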
|
||||
add rbx, rax
|
||||
# Unsigned integer divide of rdx:rax by rax: rax = 1, rdx = 0 (NOTE: this is integer 1, not DP 0.5)
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
INSTR rax
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
100
testcases/sub-rr-TP.S
Normal file
100
testcases/sub-rr-TP.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR sub
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# create DP 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones (every word compares equal to itself)
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the exponent MSB -> exponent 0x3FF, mantissa zero
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
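# Integer add of the two 1.0 bit patterns: rbx = 0x7FE0000000000000 (NOTE: not the encoding of DP 2.0, which is 0x4000000000000000)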
|
||||
add rbx, rax
|
||||
# Unsigned integer divide of rdx:rax by rax: rax = 1, rdx = 0 (NOTE: this is integer 1, not DP 0.5)
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0
|
||||
loop:
|
||||
inc i
|
||||
INSTR rdx, rax
|
||||
INSTR r9, rbx
|
||||
INSTR r10, rcx
|
||||
INSTR rdx, rax
|
||||
INSTR r9, rbx
|
||||
INSTR r10, rcx
|
||||
INSTR r11, rax
|
||||
INSTR r12, rbx
|
||||
INSTR r13, rcx
|
||||
INSTR r14, rax
|
||||
INSTR r15, rbx
|
||||
INSTR rax, rcx
|
||||
INSTR rbx, rax
|
||||
INSTR rcx, rbx
|
||||
INSTR rdx, rcx
|
||||
INSTR r9, rax
|
||||
INSTR r10, rbx
|
||||
INSTR r11, rcx
|
||||
INSTR r12, rax
|
||||
INSTR r13, rbx
|
||||
INSTR r14, rcx
|
||||
INSTR r15, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rcx
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
100
testcases/sub-rr.S
Normal file
100
testcases/sub-rr.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR sub
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# build the IEEE-754 double 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # every word compares equal to itself, so all bits of xmm0 become 1
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10), giving 0xFFC0000000000000
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF with a zero mantissa
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy the 64-bit pattern of DP 1.0 (low lane of xmm0) into rax and rbx
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
# integer add, not a floating-point one: rbx becomes twice the 1.0 bit pattern (0x7FE0000000000000), which is not the encoding of DP 2.0
|
||||
add rbx, rax
|
||||
# unsigned integer divide of rdx:rax by rax (rdx was zeroed above): the quotient 1 lands in rax and is copied to rcx; this is not DP 0.5
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0 # reload the 1.0 bit pattern that div overwrote
|
||||
loop:
|
||||
inc i
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
100
testcases/test-rr-TP.S
Normal file
100
testcases/test-rr-TP.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR test
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# build the IEEE-754 double 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # every word compares equal to itself, so all bits of xmm0 become 1
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10), giving 0xFFC0000000000000
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF with a zero mantissa
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy the 64-bit pattern of DP 1.0 (low lane of xmm0) into rax and rbx
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
# integer add, not a floating-point one: rbx becomes twice the 1.0 bit pattern (0x7FE0000000000000), which is not the encoding of DP 2.0
|
||||
add rbx, rax
|
||||
# unsigned integer divide of rdx:rax by rax (rdx was zeroed above): the quotient 1 lands in rax and is copied to rcx; this is not DP 0.5
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0 # reload the 1.0 bit pattern that div overwrote
|
||||
loop:
|
||||
inc i
|
||||
INSTR rdx, rax
|
||||
INSTR r9, rbx
|
||||
INSTR r10, rcx
|
||||
INSTR rdx, rax
|
||||
INSTR r9, rbx
|
||||
INSTR r10, rcx
|
||||
INSTR r11, rax
|
||||
INSTR r12, rbx
|
||||
INSTR r13, rcx
|
||||
INSTR r14, rax
|
||||
INSTR r15, rbx
|
||||
INSTR rax, rcx
|
||||
INSTR rbx, rax
|
||||
INSTR rcx, rbx
|
||||
INSTR rdx, rcx
|
||||
INSTR r9, rax
|
||||
INSTR r10, rbx
|
||||
INSTR r11, rcx
|
||||
INSTR r12, rax
|
||||
INSTR r13, rbx
|
||||
INSTR r14, rcx
|
||||
INSTR r15, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rcx
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
100
testcases/test-rr.S
Normal file
100
testcases/test-rr.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR test
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# build the IEEE-754 double 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # every word compares equal to itself, so all bits of xmm0 become 1
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10), giving 0xFFC0000000000000
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF with a zero mantissa
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy the 64-bit pattern of DP 1.0 (low lane of xmm0) into rax and rbx
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
# integer add, not a floating-point one: rbx becomes twice the 1.0 bit pattern (0x7FE0000000000000), which is not the encoding of DP 2.0
|
||||
add rbx, rax
|
||||
# unsigned integer divide of rdx:rax by rax (rdx was zeroed above): the quotient 1 lands in rax and is copied to rcx; this is not DP 0.5
|
||||
div rax
|
||||
movq rcx, rax
|
||||
vmovq rax, xmm0 # reload the 1.0 bit pattern that div overwrote
|
||||
loop:
|
||||
inc i
|
||||
INSTR rax, rbx # NOTE(review): test only writes flags and nothing in this loop body reads them or rewrites rax/rbx, so consecutive instructions are not linked by a data dependency; confirm this measures latency as intended
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
INSTR rax, rbx
|
||||
INSTR rbx, rax
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
67
testcases/vaddpd-avx-ymmymmymm-TP.S
Normal file
67
testcases/vaddpd-avx-ymmymmymm-TP.S
Normal file
@@ -0,0 +1,67 @@
|
||||
#define INSTR vaddpd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# build the IEEE-754 double 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # every word compares equal to itself, so all bits of xmm0 become 1
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10), giving 0xFFC0000000000000
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF with a zero mantissa
|
||||
# expand from SSE to AVX
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# copy DP 1.0
|
||||
vmovaps ymm0, ymm0
|
||||
vmovaps ymm1, ymm0
|
||||
# Create DP 2.0
|
||||
vaddpd ymm1, ymm1, ymm1
|
||||
# Create DP 0.5
|
||||
vdivpd ymm2, ymm0, ymm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm3, ymm0, ymm0
|
||||
INSTR ymm4, ymm1, ymm1
|
||||
INSTR ymm5, ymm2, ymm2
|
||||
INSTR ymm3, ymm0, ymm0
|
||||
INSTR ymm4, ymm1, ymm1
|
||||
INSTR ymm5, ymm2, ymm2
|
||||
INSTR ymm6, ymm0, ymm0
|
||||
INSTR ymm7, ymm1, ymm1
|
||||
INSTR ymm8, ymm2, ymm2
|
||||
INSTR ymm9, ymm0, ymm0
|
||||
INSTR ymm10, ymm1, ymm1
|
||||
INSTR ymm11, ymm2, ymm2
|
||||
INSTR ymm12, ymm0, ymm0
|
||||
INSTR ymm13, ymm1, ymm1
|
||||
INSTR ymm14, ymm2, ymm2
|
||||
INSTR ymm15, ymm0, ymm0
|
||||
INSTR ymm16, ymm1, ymm1 # NOTE(review): ymm16-ymm23 can only be encoded with EVEX (AVX-512VL); on a CPU with plain AVX/AVX2 this faults, so confirm the intended target
|
||||
INSTR ymm17, ymm2, ymm2
|
||||
INSTR ymm18, ymm0, ymm0
|
||||
INSTR ymm19, ymm1, ymm1
|
||||
INSTR ymm20, ymm2, ymm2
|
||||
INSTR ymm21, ymm0, ymm0
|
||||
INSTR ymm22, ymm1, ymm1
|
||||
INSTR ymm23, ymm2, ymm2
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
67
testcases/vaddpd-avx-ymmymmymm.S
Normal file
67
testcases/vaddpd-avx-ymmymmymm.S
Normal file
@@ -0,0 +1,67 @@
|
||||
#define INSTR vaddpd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# build the IEEE-754 double 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # every word compares equal to itself, so all bits of xmm0 become 1
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10), giving 0xFFC0000000000000
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF with a zero mantissa
|
||||
# expand from SSE to AVX
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# copy DP 1.0
|
||||
vmovaps ymm0, ymm0
|
||||
vmovaps ymm1, ymm0
|
||||
# Create DP 2.0
|
||||
vaddpd ymm1, ymm1, ymm1
|
||||
# Create DP 0.5
|
||||
vdivpd ymm2, ymm0, ymm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
65
testcases/vaddpd-xmmxmmxmm-TP.S
Normal file
65
testcases/vaddpd-xmmxmmxmm-TP.S
Normal file
@@ -0,0 +1,65 @@
|
||||
#define INSTR vaddpd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# build the IEEE-754 double 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # every word compares equal to itself, so all bits of xmm0 become 1
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10), giving 0xFFC0000000000000
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF with a zero mantissa
|
||||
# copy DP 1.0
|
||||
vmovaps xmm0, xmm0
|
||||
vmovaps xmm1, xmm0
|
||||
# Create DP 2.0
|
||||
vaddpd xmm1, xmm1, xmm1
|
||||
# Create DP 0.5
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm0
|
||||
INSTR xmm4, xmm1, xmm1
|
||||
INSTR xmm5, xmm2, xmm2
|
||||
INSTR xmm3, xmm0, xmm0
|
||||
INSTR xmm4, xmm1, xmm1
|
||||
INSTR xmm5, xmm2, xmm2
|
||||
INSTR xmm6, xmm0, xmm0
|
||||
INSTR xmm7, xmm1, xmm1
|
||||
INSTR xmm8, xmm2, xmm2
|
||||
INSTR xmm9, xmm0, xmm0
|
||||
INSTR xmm10, xmm1, xmm1
|
||||
INSTR xmm11, xmm2, xmm2
|
||||
INSTR xmm12, xmm0, xmm0
|
||||
INSTR xmm13, xmm1, xmm1
|
||||
INSTR xmm14, xmm2, xmm2
|
||||
INSTR xmm15, xmm0, xmm0
|
||||
INSTR xmm16, xmm1, xmm1 # NOTE(review): xmm16-xmm23 can only be encoded with EVEX (AVX-512VL); on a CPU with plain AVX/AVX2 this faults, so confirm the intended target
|
||||
INSTR xmm17, xmm2, xmm2
|
||||
INSTR xmm18, xmm0, xmm0
|
||||
INSTR xmm19, xmm1, xmm1
|
||||
INSTR xmm20, xmm2, xmm2
|
||||
INSTR xmm21, xmm0, xmm0
|
||||
INSTR xmm22, xmm1, xmm1
|
||||
INSTR xmm23, xmm2, xmm2
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
65
testcases/vaddpd-xmmxmmxmm.S
Normal file
65
testcases/vaddpd-xmmxmmxmm.S
Normal file
@@ -0,0 +1,65 @@
|
||||
#define INSTR vaddpd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# build the IEEE-754 double 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # every word compares equal to itself, so all bits of xmm0 become 1
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10), giving 0xFFC0000000000000
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF with a zero mantissa
|
||||
# copy DP 1.0
|
||||
vmovaps xmm0, xmm0
|
||||
vmovaps xmm1, xmm0
|
||||
# Create DP 2.0
|
||||
vaddpd xmm1, xmm1, xmm1
|
||||
# Create DP 0.5
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
65
testcases/vaddsd-xmmxmmxmm-TP.S
Normal file
65
testcases/vaddsd-xmmxmmxmm-TP.S
Normal file
@@ -0,0 +1,65 @@
|
||||
#define INSTR vaddsd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# build the IEEE-754 double 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # every word compares equal to itself, so all bits of xmm0 become 1
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10), giving 0xFFC0000000000000
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF with a zero mantissa
|
||||
# copy DP 1.0
|
||||
vmovaps xmm0, xmm0
|
||||
vmovaps xmm1, xmm0
|
||||
# Create DP 2.0
|
||||
vaddpd xmm1, xmm1, xmm1
|
||||
# Create DP 0.5
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm0
|
||||
INSTR xmm4, xmm1, xmm1
|
||||
INSTR xmm5, xmm2, xmm2
|
||||
INSTR xmm3, xmm0, xmm0
|
||||
INSTR xmm4, xmm1, xmm1
|
||||
INSTR xmm5, xmm2, xmm2
|
||||
INSTR xmm6, xmm0, xmm0
|
||||
INSTR xmm7, xmm1, xmm1
|
||||
INSTR xmm8, xmm2, xmm2
|
||||
INSTR xmm9, xmm0, xmm0
|
||||
INSTR xmm10, xmm1, xmm1
|
||||
INSTR xmm11, xmm2, xmm2
|
||||
INSTR xmm12, xmm0, xmm0
|
||||
INSTR xmm13, xmm1, xmm1
|
||||
INSTR xmm14, xmm2, xmm2
|
||||
INSTR xmm15, xmm0, xmm0
|
||||
INSTR xmm16, xmm1, xmm1 # NOTE(review): xmm16-xmm23 can only be encoded with EVEX (AVX-512); on a CPU with plain AVX/AVX2 this faults, so confirm the intended target
|
||||
INSTR xmm17, xmm2, xmm2
|
||||
INSTR xmm18, xmm0, xmm0
|
||||
INSTR xmm19, xmm1, xmm1
|
||||
INSTR xmm20, xmm2, xmm2
|
||||
INSTR xmm21, xmm0, xmm0
|
||||
INSTR xmm22, xmm1, xmm1
|
||||
INSTR xmm23, xmm2, xmm2
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
65
testcases/vaddsd-xmmxmmxmm.S
Normal file
65
testcases/vaddsd-xmmxmmxmm.S
Normal file
@@ -0,0 +1,65 @@
|
||||
#define INSTR vaddsd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# build the IEEE-754 double 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # every word compares equal to itself, so all bits of xmm0 become 1
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10), giving 0xFFC0000000000000
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF with a zero mantissa
|
||||
# copy DP 1.0
|
||||
vmovaps xmm0, xmm0
|
||||
vmovaps xmm1, xmm0
|
||||
# Create DP 2.0
|
||||
vaddpd xmm1, xmm1, xmm1
|
||||
# Create DP 0.5
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
67
testcases/vmovapd-avx-ymmymm-TP.S
Normal file
67
testcases/vmovapd-avx-ymmymm-TP.S
Normal file
@@ -0,0 +1,67 @@
|
||||
#define INSTR vmovapd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# build the IEEE-754 double 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # every word compares equal to itself, so all bits of xmm0 become 1
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10), giving 0xFFC0000000000000
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF with a zero mantissa
|
||||
# expand from SSE to AVX
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# copy DP 1.0
|
||||
vmovaps ymm0, ymm0
|
||||
vmovaps ymm1, ymm0
|
||||
# Create DP 2.0
|
||||
vaddpd ymm1, ymm1, ymm1
|
||||
# Create DP 0.5
|
||||
vdivpd ymm2, ymm0, ymm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm3, ymm0
|
||||
INSTR ymm4, ymm1
|
||||
INSTR ymm5, ymm2
|
||||
INSTR ymm3, ymm0
|
||||
INSTR ymm4, ymm1
|
||||
INSTR ymm5, ymm2
|
||||
INSTR ymm6, ymm0
|
||||
INSTR ymm7, ymm1
|
||||
INSTR ymm8, ymm2
|
||||
INSTR ymm9, ymm0
|
||||
INSTR ymm10, ymm1
|
||||
INSTR ymm11, ymm2
|
||||
INSTR ymm12, ymm0
|
||||
INSTR ymm13, ymm1
|
||||
INSTR ymm14, ymm2
|
||||
INSTR ymm15, ymm0
|
||||
INSTR ymm16, ymm1 # NOTE(review): ymm16-ymm23 can only be encoded with EVEX (AVX-512VL); on a CPU with plain AVX/AVX2 this faults, so confirm the intended target
|
||||
INSTR ymm17, ymm2
|
||||
INSTR ymm18, ymm0
|
||||
INSTR ymm19, ymm1
|
||||
INSTR ymm20, ymm2
|
||||
INSTR ymm21, ymm0
|
||||
INSTR ymm22, ymm1
|
||||
INSTR ymm23, ymm2
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
67
testcases/vmovapd-avx-ymmymm.S
Normal file
67
testcases/vmovapd-avx-ymmymm.S
Normal file
@@ -0,0 +1,67 @@
|
||||
#define INSTR vmovapd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# build the IEEE-754 double 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # every word compares equal to itself, so all bits of xmm0 become 1
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10), giving 0xFFC0000000000000
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF with a zero mantissa
|
||||
# expand from SSE to AVX
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# copy DP 1.0
|
||||
vmovaps ymm0, ymm0
|
||||
vmovaps ymm1, ymm0
|
||||
# Create DP 2.0
|
||||
vaddpd ymm1, ymm1, ymm1
|
||||
# Create DP 0.5
|
||||
vdivpd ymm2, ymm0, ymm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm0, ymm1
|
||||
INSTR ymm1, ymm0
|
||||
INSTR ymm0, ymm1
|
||||
INSTR ymm1, ymm0
|
||||
INSTR ymm0, ymm1
|
||||
INSTR ymm1, ymm0
|
||||
INSTR ymm0, ymm1
|
||||
INSTR ymm1, ymm0
|
||||
INSTR ymm0, ymm1
|
||||
INSTR ymm1, ymm0
|
||||
INSTR ymm0, ymm1
|
||||
INSTR ymm1, ymm0
|
||||
INSTR ymm0, ymm1
|
||||
INSTR ymm1, ymm0
|
||||
INSTR ymm0, ymm1
|
||||
INSTR ymm1, ymm0
|
||||
INSTR ymm0, ymm1
|
||||
INSTR ymm1, ymm0
|
||||
INSTR ymm0, ymm1
|
||||
INSTR ymm1, ymm0
|
||||
INSTR ymm0, ymm1
|
||||
INSTR ymm1, ymm0
|
||||
INSTR ymm0, ymm1
|
||||
INSTR ymm1, ymm0
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
65
testcases/vmovapd-xmmxmm-TP.S
Normal file
65
testcases/vmovapd-xmmxmm-TP.S
Normal file
@@ -0,0 +1,65 @@
|
||||
#define INSTR vmovapd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# build the IEEE-754 double 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # every word compares equal to itself, so all bits of xmm0 become 1
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10), giving 0xFFC0000000000000
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF with a zero mantissa
|
||||
# copy DP 1.0
|
||||
vmovaps xmm0, xmm0
|
||||
vmovaps xmm1, xmm0
|
||||
# Create DP 2.0
|
||||
vaddpd xmm1, xmm1, xmm1
|
||||
# Create DP 0.5
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0
|
||||
INSTR xmm4, xmm1
|
||||
INSTR xmm5, xmm2
|
||||
INSTR xmm3, xmm0
|
||||
INSTR xmm4, xmm1
|
||||
INSTR xmm5, xmm2
|
||||
INSTR xmm6, xmm0
|
||||
INSTR xmm7, xmm1
|
||||
INSTR xmm8, xmm2
|
||||
INSTR xmm9, xmm0
|
||||
INSTR xmm10, xmm1
|
||||
INSTR xmm11, xmm2
|
||||
INSTR xmm12, xmm0
|
||||
INSTR xmm13, xmm1
|
||||
INSTR xmm14, xmm2
|
||||
INSTR xmm15, xmm0
|
||||
INSTR xmm16, xmm1 # NOTE(review): xmm16-xmm23 can only be encoded with EVEX (AVX-512VL); on a CPU with plain AVX/AVX2 this faults, so confirm the intended target
|
||||
INSTR xmm17, xmm2
|
||||
INSTR xmm18, xmm0
|
||||
INSTR xmm19, xmm1
|
||||
INSTR xmm20, xmm2
|
||||
INSTR xmm21, xmm0
|
||||
INSTR xmm22, xmm1
|
||||
INSTR xmm23, xmm2
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
65
testcases/vmovapd-xmmxmm.S
Normal file
65
testcases/vmovapd-xmmxmm.S
Normal file
@@ -0,0 +1,65 @@
|
||||
#define INSTR vmovapd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# build the IEEE-754 double 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # every word compares equal to itself, so all bits of xmm0 become 1
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10), giving 0xFFC0000000000000
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF with a zero mantissa
|
||||
# copy DP 1.0
|
||||
vmovaps xmm0, xmm0
|
||||
vmovaps xmm1, xmm0
|
||||
# Create DP 2.0
|
||||
vaddpd xmm1, xmm1, xmm1
|
||||
# Create DP 0.5
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
65
testcases/vmovaps-xmmxmm-TP.S
Normal file
65
testcases/vmovaps-xmmxmm-TP.S
Normal file
@@ -0,0 +1,65 @@
|
||||
#define INSTR vmovaps
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# build the IEEE-754 double 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # every word compares equal to itself, so all bits of xmm0 become 1
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10), giving 0xFFC0000000000000
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF with a zero mantissa
|
||||
# copy DP 1.0
|
||||
vmovaps xmm0, xmm0
|
||||
vmovaps xmm1, xmm0
|
||||
# Create DP 2.0
|
||||
vaddpd xmm1, xmm1, xmm1
|
||||
# Create DP 0.5
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0
|
||||
INSTR xmm4, xmm1
|
||||
INSTR xmm5, xmm2
|
||||
INSTR xmm3, xmm0
|
||||
INSTR xmm4, xmm1
|
||||
INSTR xmm5, xmm2
|
||||
INSTR xmm6, xmm0
|
||||
INSTR xmm7, xmm1
|
||||
INSTR xmm8, xmm2
|
||||
INSTR xmm9, xmm0
|
||||
INSTR xmm10, xmm1
|
||||
INSTR xmm11, xmm2
|
||||
INSTR xmm12, xmm0
|
||||
INSTR xmm13, xmm1
|
||||
INSTR xmm14, xmm2
|
||||
INSTR xmm15, xmm0
|
||||
INSTR xmm16, xmm1 # NOTE(review): xmm16-xmm23 can only be encoded with EVEX (AVX-512VL); on a CPU with plain AVX/AVX2 this faults, so confirm the intended target
|
||||
INSTR xmm17, xmm2
|
||||
INSTR xmm18, xmm0
|
||||
INSTR xmm19, xmm1
|
||||
INSTR xmm20, xmm2
|
||||
INSTR xmm21, xmm0
|
||||
INSTR xmm22, xmm1
|
||||
INSTR xmm23, xmm2
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
65
testcases/vmovaps-xmmxmm.S
Normal file
65
testcases/vmovaps-xmmxmm.S
Normal file
@@ -0,0 +1,65 @@
|
||||
#define INSTR vmovaps
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# build the IEEE-754 double 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # every word compares equal to itself, so all bits of xmm0 become 1
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10), giving 0xFFC0000000000000
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF with a zero mantissa
|
||||
# copy DP 1.0
|
||||
vmovaps xmm0, xmm0
|
||||
vmovaps xmm1, xmm0
|
||||
# Create DP 2.0
|
||||
vaddpd xmm1, xmm1, xmm1
|
||||
# Create DP 0.5
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
INSTR xmm0, xmm1
|
||||
INSTR xmm1, xmm0
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
98
testcases/vmovq-rxmm-TP.S
Normal file
98
testcases/vmovq-rxmm-TP.S
Normal file
@@ -0,0 +1,98 @@
|
||||
#define INSTR vmovq
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# build the IEEE-754 double 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # every word compares equal to itself, so all bits of xmm0 become 1
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10), giving 0xFFC0000000000000
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF with a zero mantissa
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovaps xmm0, xmm0
|
||||
vmovaps xmm1, xmm0
|
||||
# Create DP 2.0
|
||||
vaddpd xmm1, xmm1, xmm1
|
||||
# Create DP 0.5
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR rdx, xmm0
|
||||
INSTR r9, xmm1
|
||||
INSTR r10, xmm2
|
||||
INSTR rdx, xmm0
|
||||
INSTR r9, xmm1
|
||||
INSTR r10, xmm2
|
||||
INSTR r11, xmm0
|
||||
INSTR r12, xmm1
|
||||
INSTR r13, xmm2
|
||||
INSTR r14, xmm0
|
||||
INSTR r15, xmm1
|
||||
INSTR rax, xmm2
|
||||
INSTR rbx, xmm0
|
||||
INSTR rcx, xmm1
|
||||
INSTR rdx, xmm2
|
||||
INSTR r9, xmm0
|
||||
INSTR r10, xmm1
|
||||
INSTR r11, xmm2
|
||||
INSTR r12, xmm0
|
||||
INSTR r13, xmm1
|
||||
INSTR r14, xmm2
|
||||
INSTR r15, xmm0
|
||||
INSTR rax, xmm1
|
||||
INSTR rbx, xmm2
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
98
testcases/vmovq-rxmm.S
Normal file
98
testcases/vmovq-rxmm.S
Normal file
@@ -0,0 +1,98 @@
|
||||
#define INSTR vmovq
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
latency:
|
||||
push rbp
|
||||
mov rbp, rsp
|
||||
xor i, i
|
||||
test N, N
|
||||
jle done
|
||||
# build the IEEE-754 double 1.0 (bit pattern 0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # every word compares equal to itself, so all bits of xmm0 become 1
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: only the top 10 bits stay set (54 = 64 - 10), giving 0xFFC0000000000000
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF with a zero mantissa
|
||||
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy DP 1.0
|
||||
vmovaps xmm0, xmm0
|
||||
vmovaps xmm1, xmm0
|
||||
# Create DP 2.0
|
||||
vaddpd xmm1, xmm1, xmm1
|
||||
# Create DP 0.5
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR rax, xmm0 # NOTE(review): every instruction in this loop body writes rax from xmm0 and none reads rax, so there is no read-after-write chain between them; confirm this measures latency rather than throughput
|
||||
INSTR rax, xmm0
|
||||
INSTR rax, xmm0
|
||||
INSTR rax, xmm0
|
||||
INSTR rax, xmm0
|
||||
INSTR rax, xmm0
|
||||
INSTR rax, xmm0
|
||||
INSTR rax, xmm0
|
||||
INSTR rax, xmm0
|
||||
INSTR rax, xmm0
|
||||
INSTR rax, xmm0
|
||||
INSTR rax, xmm0
|
||||
INSTR rax, xmm0
|
||||
INSTR rax, xmm0
|
||||
INSTR rax, xmm0
|
||||
INSTR rax, xmm0
|
||||
INSTR rax, xmm0
|
||||
INSTR rax, xmm0
|
||||
INSTR rax, xmm0
|
||||
INSTR rax, xmm0
|
||||
INSTR rax, xmm0
|
||||
INSTR rax, xmm0
|
||||
INSTR rax, xmm0
|
||||
INSTR rax, xmm0
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
100
testcases/vmovq-xmmr-TP.S
Normal file
100
testcases/vmovq-xmmr-TP.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR vmovq
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
# Entry of latency: set up the frame, leave early for a non-positive N,
# save and zero the general-purpose registers, then load rax, rbx and rcx
# with the integer operands read by the loop.
latency:
|
||||
push rbp
|
||||
mov rbp, rsp # frame pointer; the code at done restores rsp from it
|
||||
xor i, i # zero the loop counter
|
||||
test N, N
|
||||
jle done # skip the loop when N <= 0
|
||||
# create DP 1.0 (0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0, ten one bits remain (54=64-10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF and a zero mantissa
|
||||
# save the general-purpose registers that are zeroed below; the matching pops precede the done label
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
# zero the saved registers
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy the DP 1.0 bit pattern into two general-purpose registers
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
# NOTE: integer add of two 1.0 bit patterns; rbx becomes 0x7FE0000000000000, which is not the encoding of DP 2.0
|
||||
add rbx, rax
|
||||
# NOTE: unsigned integer divide of rdx:rax by rax (rdx was zeroed above); rax becomes 1 and rdx 0, no DP 0.5 is produced
|
||||
div rax
|
||||
movq rcx, rax # rcx = 1
|
||||
vmovq rax, xmm0 # reload the 1.0 bit pattern, div overwrote rax
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, rax
|
||||
INSTR xmm4, rbx
|
||||
INSTR xmm5, rcx
|
||||
INSTR xmm3, rax
|
||||
INSTR xmm4, rbx
|
||||
INSTR xmm5, rcx
|
||||
INSTR xmm6, rax
|
||||
INSTR xmm7, rbx
|
||||
INSTR xmm8, rcx
|
||||
INSTR xmm9, rax
|
||||
INSTR xmm10, rbx
|
||||
INSTR xmm11, rcx
|
||||
INSTR xmm12, rax
|
||||
INSTR xmm13, rbx
|
||||
INSTR xmm14, rcx
|
||||
INSTR xmm15, rax
|
||||
INSTR xmm16, rbx
|
||||
INSTR xmm17, rcx
|
||||
INSTR xmm18, rax
|
||||
INSTR xmm19, rbx
|
||||
INSTR xmm20, rcx
|
||||
INSTR xmm21, rax
|
||||
INSTR xmm22, rbx
|
||||
INSTR xmm23, rcx
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
100
testcases/vmovq-xmmr.S
Normal file
100
testcases/vmovq-xmmr.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR vmovq
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
# Entry of latency: set up the frame, leave early for a non-positive N,
# save and zero the general-purpose registers, then load rax with the
# integer operand read by the loop (the loop uses only xmm0 and rax).
latency:
|
||||
push rbp
|
||||
mov rbp, rsp # frame pointer; the code at done restores rsp from it
|
||||
xor i, i # zero the loop counter
|
||||
test N, N
|
||||
jle done # skip the loop when N <= 0
|
||||
# create DP 1.0 (0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0, ten one bits remain (54=64-10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF and a zero mantissa
|
||||
# save the general-purpose registers that are zeroed below; the matching pops precede the done label
push rax
|
||||
push rbx
|
||||
push rcx
|
||||
push rdx
|
||||
push r9
|
||||
push r10
|
||||
push r11
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
# zero the saved registers
xor rax, rax
|
||||
xor rbx, rbx
|
||||
xor rcx, rcx
|
||||
xor rdx, rdx
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
# copy the DP 1.0 bit pattern into two general-purpose registers
|
||||
vmovq rax, xmm0
|
||||
vmovq rbx, xmm0
|
||||
# NOTE: integer add of two 1.0 bit patterns; rbx becomes 0x7FE0000000000000, which is not the encoding of DP 2.0
|
||||
add rbx, rax
|
||||
# NOTE: unsigned integer divide of rdx:rax by rax (rdx was zeroed above); rax becomes 1 and rdx 0, no DP 0.5 is produced
|
||||
div rax
|
||||
movq rcx, rax # rcx = 1
|
||||
vmovq rax, xmm0 # reload the 1.0 bit pattern, div overwrote rax
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, rax
|
||||
INSTR xmm0, rax
|
||||
INSTR xmm0, rax
|
||||
INSTR xmm0, rax
|
||||
INSTR xmm0, rax
|
||||
INSTR xmm0, rax
|
||||
INSTR xmm0, rax
|
||||
INSTR xmm0, rax
|
||||
INSTR xmm0, rax
|
||||
INSTR xmm0, rax
|
||||
INSTR xmm0, rax
|
||||
INSTR xmm0, rax
|
||||
INSTR xmm0, rax
|
||||
INSTR xmm0, rax
|
||||
INSTR xmm0, rax
|
||||
INSTR xmm0, rax
|
||||
INSTR xmm0, rax
|
||||
INSTR xmm0, rax
|
||||
INSTR xmm0, rax
|
||||
INSTR xmm0, rax
|
||||
INSTR xmm0, rax
|
||||
INSTR xmm0, rax
|
||||
INSTR xmm0, rax
|
||||
INSTR xmm0, rax
|
||||
cmp i, N
|
||||
jl loop
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop r11
|
||||
pop r10
|
||||
pop r9
|
||||
pop rdx
|
||||
pop rcx
|
||||
pop rbx
|
||||
pop rax
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
65
testcases/vmovsd-xmmxmmxmm-TP.S
Normal file
65
testcases/vmovsd-xmmxmmxmm-TP.S
Normal file
@@ -0,0 +1,65 @@
|
||||
#define INSTR vmovsd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
# Entry of latency: set up the frame, leave early for a non-positive N,
# then build the DP constants 1.0, 2.0 and 0.5 in xmm0, xmm1 and xmm2.
latency:
|
||||
push rbp
|
||||
mov rbp, rsp # frame pointer; the code at done restores rsp from it
|
||||
xor i, i # zero the loop counter
|
||||
test N, N
|
||||
jle done # skip the loop when N <= 0
|
||||
# create DP 1.0 (0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0, ten one bits remain (54=64-10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF and a zero mantissa
|
||||
# copy DP 1.0
|
||||
vmovaps xmm0, xmm0 # self-copy, xmm0 keeps its value
|
||||
vmovaps xmm1, xmm0
|
||||
# Create DP 2.0 (1.0 + 1.0)
|
||||
vaddpd xmm1, xmm1, xmm1
|
||||
# Create DP 0.5 (1.0 / 2.0)
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm0
|
||||
INSTR xmm4, xmm1, xmm1
|
||||
INSTR xmm5, xmm2, xmm2
|
||||
INSTR xmm3, xmm0, xmm0
|
||||
INSTR xmm4, xmm1, xmm1
|
||||
INSTR xmm5, xmm2, xmm2
|
||||
INSTR xmm6, xmm0, xmm0
|
||||
INSTR xmm7, xmm1, xmm1
|
||||
INSTR xmm8, xmm2, xmm2
|
||||
INSTR xmm9, xmm0, xmm0
|
||||
INSTR xmm10, xmm1, xmm1
|
||||
INSTR xmm11, xmm2, xmm2
|
||||
INSTR xmm12, xmm0, xmm0
|
||||
INSTR xmm13, xmm1, xmm1
|
||||
INSTR xmm14, xmm2, xmm2
|
||||
INSTR xmm15, xmm0, xmm0
|
||||
INSTR xmm16, xmm1, xmm1
|
||||
INSTR xmm17, xmm2, xmm2
|
||||
INSTR xmm18, xmm0, xmm0
|
||||
INSTR xmm19, xmm1, xmm1
|
||||
INSTR xmm20, xmm2, xmm2
|
||||
INSTR xmm21, xmm0, xmm0
|
||||
INSTR xmm22, xmm1, xmm1
|
||||
INSTR xmm23, xmm2, xmm2
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
65
testcases/vmovsd-xmmxmmxmm.S
Normal file
65
testcases/vmovsd-xmmxmmxmm.S
Normal file
@@ -0,0 +1,65 @@
|
||||
#define INSTR vmovsd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
# Entry of latency: set up the frame, leave early for a non-positive N,
# then build the DP constants 1.0, 2.0 and 0.5 in xmm0, xmm1 and xmm2.
latency:
|
||||
push rbp
|
||||
mov rbp, rsp # frame pointer; the code at done restores rsp from it
|
||||
xor i, i # zero the loop counter
|
||||
test N, N
|
||||
jle done # skip the loop when N <= 0
|
||||
# create DP 1.0 (0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0, ten one bits remain (54=64-10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF and a zero mantissa
|
||||
# copy DP 1.0
|
||||
vmovaps xmm0, xmm0 # self-copy, xmm0 keeps its value
|
||||
vmovaps xmm1, xmm0
|
||||
# Create DP 2.0 (1.0 + 1.0)
|
||||
vaddpd xmm1, xmm1, xmm1
|
||||
# Create DP 0.5 (1.0 / 2.0)
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
67
testcases/vmulpd-avx-ymmymmymm-TP.S
Normal file
67
testcases/vmulpd-avx-ymmymmymm-TP.S
Normal file
@@ -0,0 +1,67 @@
|
||||
#define INSTR vmulpd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
# Entry of latency: set up the frame, leave early for a non-positive N,
# then build the DP constants 1.0, 2.0 and 0.5 in ymm0, ymm1 and ymm2.
latency:
|
||||
push rbp
|
||||
mov rbp, rsp # frame pointer; the code at done restores rsp from it
|
||||
xor i, i # zero the loop counter
|
||||
test N, N
|
||||
jle done # skip the loop when N <= 0
|
||||
# create DP 1.0 (0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0, ten one bits remain (54=64-10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF and a zero mantissa
|
||||
# expand from SSE to AVX: copy xmm0 into the upper 128-bit lane so all four lanes of ymm0 hold 1.0
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# copy DP 1.0
|
||||
vmovaps ymm0, ymm0 # self-copy, ymm0 keeps its value
|
||||
vmovaps ymm1, ymm0
|
||||
# Create DP 2.0 (1.0 + 1.0)
|
||||
vaddpd ymm1, ymm1, ymm1
|
||||
# Create DP 0.5 (1.0 / 2.0)
|
||||
vdivpd ymm2, ymm0, ymm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm3, ymm0, ymm0
|
||||
INSTR ymm4, ymm1, ymm1
|
||||
INSTR ymm5, ymm2, ymm2
|
||||
INSTR ymm3, ymm0, ymm0
|
||||
INSTR ymm4, ymm1, ymm1
|
||||
INSTR ymm5, ymm2, ymm2
|
||||
INSTR ymm6, ymm0, ymm0
|
||||
INSTR ymm7, ymm1, ymm1
|
||||
INSTR ymm8, ymm2, ymm2
|
||||
INSTR ymm9, ymm0, ymm0
|
||||
INSTR ymm10, ymm1, ymm1
|
||||
INSTR ymm11, ymm2, ymm2
|
||||
INSTR ymm12, ymm0, ymm0
|
||||
INSTR ymm13, ymm1, ymm1
|
||||
INSTR ymm14, ymm2, ymm2
|
||||
INSTR ymm15, ymm0, ymm0
|
||||
INSTR ymm16, ymm1, ymm1
|
||||
INSTR ymm17, ymm2, ymm2
|
||||
INSTR ymm18, ymm0, ymm0
|
||||
INSTR ymm19, ymm1, ymm1
|
||||
INSTR ymm20, ymm2, ymm2
|
||||
INSTR ymm21, ymm0, ymm0
|
||||
INSTR ymm22, ymm1, ymm1
|
||||
INSTR ymm23, ymm2, ymm2
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
67
testcases/vmulpd-avx-ymmymmymm.S
Normal file
67
testcases/vmulpd-avx-ymmymmymm.S
Normal file
@@ -0,0 +1,67 @@
|
||||
#define INSTR vmulpd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
# Entry of latency: set up the frame, leave early for a non-positive N,
# then build the DP constants 1.0, 2.0 and 0.5 in ymm0, ymm1 and ymm2.
latency:
|
||||
push rbp
|
||||
mov rbp, rsp # frame pointer; the code at done restores rsp from it
|
||||
xor i, i # zero the loop counter
|
||||
test N, N
|
||||
jle done # skip the loop when N <= 0
|
||||
# create DP 1.0 (0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0, ten one bits remain (54=64-10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF and a zero mantissa
|
||||
# expand from SSE to AVX: copy xmm0 into the upper 128-bit lane so all four lanes of ymm0 hold 1.0
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# copy DP 1.0
|
||||
vmovaps ymm0, ymm0 # self-copy, ymm0 keeps its value
|
||||
vmovaps ymm1, ymm0
|
||||
# Create DP 2.0 (1.0 + 1.0)
|
||||
vaddpd ymm1, ymm1, ymm1
|
||||
# Create DP 0.5 (1.0 / 2.0)
|
||||
vdivpd ymm2, ymm0, ymm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
65
testcases/vmulsd-xmmxmmxmm-TP.S
Normal file
65
testcases/vmulsd-xmmxmmxmm-TP.S
Normal file
@@ -0,0 +1,65 @@
|
||||
#define INSTR vmulsd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
# Entry of latency: set up the frame, leave early for a non-positive N,
# then build the DP constants 1.0, 2.0 and 0.5 in xmm0, xmm1 and xmm2.
latency:
|
||||
push rbp
|
||||
mov rbp, rsp # frame pointer; the code at done restores rsp from it
|
||||
xor i, i # zero the loop counter
|
||||
test N, N
|
||||
jle done # skip the loop when N <= 0
|
||||
# create DP 1.0 (0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0, ten one bits remain (54=64-10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF and a zero mantissa
|
||||
# copy DP 1.0
|
||||
vmovaps xmm0, xmm0 # self-copy, xmm0 keeps its value
|
||||
vmovaps xmm1, xmm0
|
||||
# Create DP 2.0 (1.0 + 1.0)
|
||||
vaddpd xmm1, xmm1, xmm1
|
||||
# Create DP 0.5 (1.0 / 2.0)
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm0
|
||||
INSTR xmm4, xmm1, xmm1
|
||||
INSTR xmm5, xmm2, xmm2
|
||||
INSTR xmm3, xmm0, xmm0
|
||||
INSTR xmm4, xmm1, xmm1
|
||||
INSTR xmm5, xmm2, xmm2
|
||||
INSTR xmm6, xmm0, xmm0
|
||||
INSTR xmm7, xmm1, xmm1
|
||||
INSTR xmm8, xmm2, xmm2
|
||||
INSTR xmm9, xmm0, xmm0
|
||||
INSTR xmm10, xmm1, xmm1
|
||||
INSTR xmm11, xmm2, xmm2
|
||||
INSTR xmm12, xmm0, xmm0
|
||||
INSTR xmm13, xmm1, xmm1
|
||||
INSTR xmm14, xmm2, xmm2
|
||||
INSTR xmm15, xmm0, xmm0
|
||||
INSTR xmm16, xmm1, xmm1
|
||||
INSTR xmm17, xmm2, xmm2
|
||||
INSTR xmm18, xmm0, xmm0
|
||||
INSTR xmm19, xmm1, xmm1
|
||||
INSTR xmm20, xmm2, xmm2
|
||||
INSTR xmm21, xmm0, xmm0
|
||||
INSTR xmm22, xmm1, xmm1
|
||||
INSTR xmm23, xmm2, xmm2
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
65
testcases/vmulsd-xmmxmmxmm.S
Normal file
65
testcases/vmulsd-xmmxmmxmm.S
Normal file
@@ -0,0 +1,65 @@
|
||||
#define INSTR vmulsd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
# Entry of latency: set up the frame, leave early for a non-positive N,
# then build the DP constants 1.0, 2.0 and 0.5 in xmm0, xmm1 and xmm2.
latency:
|
||||
push rbp
|
||||
mov rbp, rsp # frame pointer; the code at done restores rsp from it
|
||||
xor i, i # zero the loop counter
|
||||
test N, N
|
||||
jle done # skip the loop when N <= 0
|
||||
# create DP 1.0 (0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0, ten one bits remain (54=64-10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF and a zero mantissa
|
||||
# copy DP 1.0
|
||||
vmovaps xmm0, xmm0 # self-copy, xmm0 keeps its value
|
||||
vmovaps xmm1, xmm0
|
||||
# Create DP 2.0 (1.0 + 1.0)
|
||||
vaddpd xmm1, xmm1, xmm1
|
||||
# Create DP 0.5 (1.0 / 2.0)
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
67
testcases/vsubpd-avx-ymmymmymm-TP.S
Normal file
67
testcases/vsubpd-avx-ymmymmymm-TP.S
Normal file
@@ -0,0 +1,67 @@
|
||||
#define INSTR vsubpd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
# Entry of latency: set up the frame, leave early for a non-positive N,
# then build the DP constants 1.0, 2.0 and 0.5 in ymm0, ymm1 and ymm2.
latency:
|
||||
push rbp
|
||||
mov rbp, rsp # frame pointer; the code at done restores rsp from it
|
||||
xor i, i # zero the loop counter
|
||||
test N, N
|
||||
jle done # skip the loop when N <= 0
|
||||
# create DP 1.0 (0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0, ten one bits remain (54=64-10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF and a zero mantissa
|
||||
# expand from SSE to AVX: copy xmm0 into the upper 128-bit lane so all four lanes of ymm0 hold 1.0
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# copy DP 1.0
|
||||
vmovaps ymm0, ymm0 # self-copy, ymm0 keeps its value
|
||||
vmovaps ymm1, ymm0
|
||||
# Create DP 2.0 (1.0 + 1.0)
|
||||
vaddpd ymm1, ymm1, ymm1
|
||||
# Create DP 0.5 (1.0 / 2.0)
|
||||
vdivpd ymm2, ymm0, ymm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm3, ymm0, ymm0
|
||||
INSTR ymm4, ymm1, ymm1
|
||||
INSTR ymm5, ymm2, ymm2
|
||||
INSTR ymm3, ymm0, ymm0
|
||||
INSTR ymm4, ymm1, ymm1
|
||||
INSTR ymm5, ymm2, ymm2
|
||||
INSTR ymm6, ymm0, ymm0
|
||||
INSTR ymm7, ymm1, ymm1
|
||||
INSTR ymm8, ymm2, ymm2
|
||||
INSTR ymm9, ymm0, ymm0
|
||||
INSTR ymm10, ymm1, ymm1
|
||||
INSTR ymm11, ymm2, ymm2
|
||||
INSTR ymm12, ymm0, ymm0
|
||||
INSTR ymm13, ymm1, ymm1
|
||||
INSTR ymm14, ymm2, ymm2
|
||||
INSTR ymm15, ymm0, ymm0
|
||||
INSTR ymm16, ymm1, ymm1
|
||||
INSTR ymm17, ymm2, ymm2
|
||||
INSTR ymm18, ymm0, ymm0
|
||||
INSTR ymm19, ymm1, ymm1
|
||||
INSTR ymm20, ymm2, ymm2
|
||||
INSTR ymm21, ymm0, ymm0
|
||||
INSTR ymm22, ymm1, ymm1
|
||||
INSTR ymm23, ymm2, ymm2
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
67
testcases/vsubpd-avx-ymmymmymm.S
Normal file
67
testcases/vsubpd-avx-ymmymmymm.S
Normal file
@@ -0,0 +1,67 @@
|
||||
#define INSTR vsubpd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
# Entry of latency: set up the frame, leave early for a non-positive N,
# then build the DP constants 1.0, 2.0 and 0.5 in ymm0, ymm1 and ymm2.
latency:
|
||||
push rbp
|
||||
mov rbp, rsp # frame pointer; the code at done restores rsp from it
|
||||
xor i, i # zero the loop counter
|
||||
test N, N
|
||||
jle done # skip the loop when N <= 0
|
||||
# create DP 1.0 (0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0, ten one bits remain (54=64-10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF and a zero mantissa
|
||||
# expand from SSE to AVX: copy xmm0 into the upper 128-bit lane so all four lanes of ymm0 hold 1.0
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# copy DP 1.0
|
||||
vmovaps ymm0, ymm0 # self-copy, ymm0 keeps its value
|
||||
vmovaps ymm1, ymm0
|
||||
# Create DP 2.0 (1.0 + 1.0)
|
||||
vaddpd ymm1, ymm1, ymm1
|
||||
# Create DP 0.5 (1.0 / 2.0)
|
||||
vdivpd ymm2, ymm0, ymm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
65
testcases/vsubsd-xmmxmmxmm-TP.S
Normal file
65
testcases/vsubsd-xmmxmmxmm-TP.S
Normal file
@@ -0,0 +1,65 @@
|
||||
#define INSTR vsubsd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
# Entry of latency: set up the frame, leave early for a non-positive N,
# then build the DP constants 1.0, 2.0 and 0.5 in xmm0, xmm1 and xmm2.
latency:
|
||||
push rbp
|
||||
mov rbp, rsp # frame pointer; the code at done restores rsp from it
|
||||
xor i, i # zero the loop counter
|
||||
test N, N
|
||||
jle done # skip the loop when N <= 0
|
||||
# create DP 1.0 (0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0, ten one bits remain (54=64-10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF and a zero mantissa
|
||||
# copy DP 1.0
|
||||
vmovaps xmm0, xmm0 # self-copy, xmm0 keeps its value
|
||||
vmovaps xmm1, xmm0
|
||||
# Create DP 2.0 (1.0 + 1.0)
|
||||
vaddpd xmm1, xmm1, xmm1
|
||||
# Create DP 0.5 (1.0 / 2.0)
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm0
|
||||
INSTR xmm4, xmm1, xmm1
|
||||
INSTR xmm5, xmm2, xmm2
|
||||
INSTR xmm3, xmm0, xmm0
|
||||
INSTR xmm4, xmm1, xmm1
|
||||
INSTR xmm5, xmm2, xmm2
|
||||
INSTR xmm6, xmm0, xmm0
|
||||
INSTR xmm7, xmm1, xmm1
|
||||
INSTR xmm8, xmm2, xmm2
|
||||
INSTR xmm9, xmm0, xmm0
|
||||
INSTR xmm10, xmm1, xmm1
|
||||
INSTR xmm11, xmm2, xmm2
|
||||
INSTR xmm12, xmm0, xmm0
|
||||
INSTR xmm13, xmm1, xmm1
|
||||
INSTR xmm14, xmm2, xmm2
|
||||
INSTR xmm15, xmm0, xmm0
|
||||
INSTR xmm16, xmm1, xmm1
|
||||
INSTR xmm17, xmm2, xmm2
|
||||
INSTR xmm18, xmm0, xmm0
|
||||
INSTR xmm19, xmm1, xmm1
|
||||
INSTR xmm20, xmm2, xmm2
|
||||
INSTR xmm21, xmm0, xmm0
|
||||
INSTR xmm22, xmm1, xmm1
|
||||
INSTR xmm23, xmm2, xmm2
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
65
testcases/vsubsd-xmmxmmxmm.S
Normal file
65
testcases/vsubsd-xmmxmmxmm.S
Normal file
@@ -0,0 +1,65 @@
|
||||
#define INSTR vsubsd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
# Entry of latency: set up the frame, leave early for a non-positive N,
# then build the DP constants 1.0, 2.0 and 0.5 in xmm0, xmm1 and xmm2.
latency:
|
||||
push rbp
|
||||
mov rbp, rsp # frame pointer; the code at done restores rsp from it
|
||||
xor i, i # zero the loop counter
|
||||
test N, N
|
||||
jle done # skip the loop when N <= 0
|
||||
# create DP 1.0 (0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0, ten one bits remain (54=64-10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF and a zero mantissa
|
||||
# copy DP 1.0
|
||||
vmovaps xmm0, xmm0 # self-copy, xmm0 keeps its value
|
||||
vmovaps xmm1, xmm0
|
||||
# Create DP 2.0 (1.0 + 1.0)
|
||||
vaddpd xmm1, xmm1, xmm1
|
||||
# Create DP 0.5 (1.0 / 2.0)
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
65
testcases/vunpckhpd-xmmxmmxmm-TP.S
Normal file
65
testcases/vunpckhpd-xmmxmmxmm-TP.S
Normal file
@@ -0,0 +1,65 @@
|
||||
#define INSTR vunpckhpd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
# Entry of latency: set up the frame, leave early for a non-positive N,
# then build the DP constants 1.0, 2.0 and 0.5 in xmm0, xmm1 and xmm2.
latency:
|
||||
push rbp
|
||||
mov rbp, rsp # frame pointer; the code at done restores rsp from it
|
||||
xor i, i # zero the loop counter
|
||||
test N, N
|
||||
jle done # skip the loop when N <= 0
|
||||
# create DP 1.0 (0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0, ten one bits remain (54=64-10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF and a zero mantissa
|
||||
# copy DP 1.0
|
||||
vmovaps xmm0, xmm0 # self-copy, xmm0 keeps its value
|
||||
vmovaps xmm1, xmm0
|
||||
# Create DP 2.0 (1.0 + 1.0)
|
||||
vaddpd xmm1, xmm1, xmm1
|
||||
# Create DP 0.5 (1.0 / 2.0)
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm0
|
||||
INSTR xmm4, xmm1, xmm1
|
||||
INSTR xmm5, xmm2, xmm2
|
||||
INSTR xmm3, xmm0, xmm0
|
||||
INSTR xmm4, xmm1, xmm1
|
||||
INSTR xmm5, xmm2, xmm2
|
||||
INSTR xmm6, xmm0, xmm0
|
||||
INSTR xmm7, xmm1, xmm1
|
||||
INSTR xmm8, xmm2, xmm2
|
||||
INSTR xmm9, xmm0, xmm0
|
||||
INSTR xmm10, xmm1, xmm1
|
||||
INSTR xmm11, xmm2, xmm2
|
||||
INSTR xmm12, xmm0, xmm0
|
||||
INSTR xmm13, xmm1, xmm1
|
||||
INSTR xmm14, xmm2, xmm2
|
||||
INSTR xmm15, xmm0, xmm0
|
||||
INSTR xmm16, xmm1, xmm1
|
||||
INSTR xmm17, xmm2, xmm2
|
||||
INSTR xmm18, xmm0, xmm0
|
||||
INSTR xmm19, xmm1, xmm1
|
||||
INSTR xmm20, xmm2, xmm2
|
||||
INSTR xmm21, xmm0, xmm0
|
||||
INSTR xmm22, xmm1, xmm1
|
||||
INSTR xmm23, xmm2, xmm2
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
65
testcases/vunpckhpd-xmmxmmxmm.S
Normal file
65
testcases/vunpckhpd-xmmxmmxmm.S
Normal file
@@ -0,0 +1,65 @@
|
||||
#define INSTR vunpckhpd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
# Entry of latency: set up the frame, leave early for a non-positive N,
# then build the DP constants 1.0, 2.0 and 0.5 in xmm0, xmm1 and xmm2.
latency:
|
||||
push rbp
|
||||
mov rbp, rsp # frame pointer; the code at done restores rsp from it
|
||||
xor i, i # zero the loop counter
|
||||
test N, N
|
||||
jle done # skip the loop when N <= 0
|
||||
# create DP 1.0 (0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0, ten one bits remain (54=64-10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF and a zero mantissa
|
||||
# copy DP 1.0
|
||||
vmovaps xmm0, xmm0 # self-copy, xmm0 keeps its value
|
||||
vmovaps xmm1, xmm0
|
||||
# Create DP 2.0 (1.0 + 1.0)
|
||||
vaddpd xmm1, xmm1, xmm1
|
||||
# Create DP 0.5 (1.0 / 2.0)
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
INSTR xmm0, xmm1, xmm0
|
||||
INSTR xmm1, xmm0, xmm0
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
67
testcases/vxorpd-avx-ymmymmymm-TP.S
Normal file
67
testcases/vxorpd-avx-ymmymmymm-TP.S
Normal file
@@ -0,0 +1,67 @@
|
||||
#define INSTR vxorpd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
# Entry of latency: set up the frame, leave early for a non-positive N,
# then build the DP constants 1.0, 2.0 and 0.5 in ymm0, ymm1 and ymm2.
latency:
|
||||
push rbp
|
||||
mov rbp, rsp # frame pointer; the code at done restores rsp from it
|
||||
xor i, i # zero the loop counter
|
||||
test N, N
|
||||
jle done # skip the loop when N <= 0
|
||||
# create DP 1.0 (0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0, ten one bits remain (54=64-10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF and a zero mantissa
|
||||
# expand from SSE to AVX: copy xmm0 into the upper 128-bit lane so all four lanes of ymm0 hold 1.0
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# copy DP 1.0
|
||||
vmovaps ymm0, ymm0 # self-copy, ymm0 keeps its value
|
||||
vmovaps ymm1, ymm0
|
||||
# Create DP 2.0 (1.0 + 1.0)
|
||||
vaddpd ymm1, ymm1, ymm1
|
||||
# Create DP 0.5 (1.0 / 2.0)
|
||||
vdivpd ymm2, ymm0, ymm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm3, ymm0, ymm0
|
||||
INSTR ymm4, ymm1, ymm1
|
||||
INSTR ymm5, ymm2, ymm2
|
||||
INSTR ymm3, ymm0, ymm0
|
||||
INSTR ymm4, ymm1, ymm1
|
||||
INSTR ymm5, ymm2, ymm2
|
||||
INSTR ymm6, ymm0, ymm0
|
||||
INSTR ymm7, ymm1, ymm1
|
||||
INSTR ymm8, ymm2, ymm2
|
||||
INSTR ymm9, ymm0, ymm0
|
||||
INSTR ymm10, ymm1, ymm1
|
||||
INSTR ymm11, ymm2, ymm2
|
||||
INSTR ymm12, ymm0, ymm0
|
||||
INSTR ymm13, ymm1, ymm1
|
||||
INSTR ymm14, ymm2, ymm2
|
||||
INSTR ymm15, ymm0, ymm0
|
||||
INSTR ymm16, ymm1, ymm1
|
||||
INSTR ymm17, ymm2, ymm2
|
||||
INSTR ymm18, ymm0, ymm0
|
||||
INSTR ymm19, ymm1, ymm1
|
||||
INSTR ymm20, ymm2, ymm2
|
||||
INSTR ymm21, ymm0, ymm0
|
||||
INSTR ymm22, ymm1, ymm1
|
||||
INSTR ymm23, ymm2, ymm2
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
67
testcases/vxorpd-avx-ymmymmymm.S
Normal file
67
testcases/vxorpd-avx-ymmymmymm.S
Normal file
@@ -0,0 +1,67 @@
|
||||
#define INSTR vxorpd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
# Entry of latency: set up the frame, leave early for a non-positive N,
# then build the DP constants 1.0, 2.0 and 0.5 in ymm0, ymm1 and ymm2.
latency:
|
||||
push rbp
|
||||
mov rbp, rsp # frame pointer; the code at done restores rsp from it
|
||||
xor i, i # zero the loop counter
|
||||
test N, N
|
||||
jle done # skip the loop when N <= 0
|
||||
# create DP 1.0 (0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0, ten one bits remain (54=64-10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF and a zero mantissa
|
||||
# expand from SSE to AVX: copy xmm0 into the upper 128-bit lane so all four lanes of ymm0 hold 1.0
|
||||
vinsertf128 ymm0, ymm0, xmm0, 0x1
|
||||
# copy DP 1.0
|
||||
vmovaps ymm0, ymm0 # self-copy, ymm0 keeps its value
|
||||
vmovaps ymm1, ymm0
|
||||
# Create DP 2.0 (1.0 + 1.0)
|
||||
vaddpd ymm1, ymm1, ymm1
|
||||
# Create DP 0.5 (1.0 / 2.0)
|
||||
vdivpd ymm2, ymm0, ymm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
INSTR ymm0, ymm1, ymm0
|
||||
INSTR ymm1, ymm0, ymm0
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
65
testcases/vxorpd-xmmxmmxmm-TP.S
Normal file
65
testcases/vxorpd-xmmxmmxmm-TP.S
Normal file
@@ -0,0 +1,65 @@
|
||||
#define INSTR vxorpd
|
||||
#define NINST 24
|
||||
#define N edi
|
||||
#define i r8d
|
||||
|
||||
|
||||
.intel_syntax noprefix
|
||||
.globl ninst
|
||||
.data
|
||||
ninst:
|
||||
.long NINST
|
||||
.text
|
||||
.globl latency
|
||||
.type latency, @function
|
||||
.align 32
|
||||
# Entry of latency: set up the frame, leave early for a non-positive N,
# then build the DP constants 1.0, 2.0 and 0.5 in xmm0, xmm1 and xmm2.
latency:
|
||||
push rbp
|
||||
mov rbp, rsp # frame pointer; the code at done restores rsp from it
|
||||
xor i, i # zero the loop counter
|
||||
test N, N
|
||||
jle done # skip the loop when N <= 0
|
||||
# create DP 1.0 (0x3FF0000000000000) in each 64-bit lane of xmm0
|
||||
vpcmpeqw xmm0, xmm0, xmm0 # all ones
|
||||
vpsllq xmm0, xmm0, 54 # logical left shift: 1111111111 0..0, ten one bits remain (54=64-10)
|
||||
vpsrlq xmm0, xmm0, 2 # logical right shift: clears the sign bit and the top exponent bit, leaving exponent 0x3FF and a zero mantissa
|
||||
# copy DP 1.0
|
||||
vmovaps xmm0, xmm0 # self-copy, xmm0 keeps its value
|
||||
vmovaps xmm1, xmm0
|
||||
# Create DP 2.0 (1.0 + 1.0)
|
||||
vaddpd xmm1, xmm1, xmm1
|
||||
# Create DP 0.5 (1.0 / 2.0)
|
||||
vdivpd xmm2, xmm0, xmm1
|
||||
loop:
|
||||
inc i
|
||||
INSTR xmm3, xmm0, xmm0
|
||||
INSTR xmm4, xmm1, xmm1
|
||||
INSTR xmm5, xmm2, xmm2
|
||||
INSTR xmm3, xmm0, xmm0
|
||||
INSTR xmm4, xmm1, xmm1
|
||||
INSTR xmm5, xmm2, xmm2
|
||||
INSTR xmm6, xmm0, xmm0
|
||||
INSTR xmm7, xmm1, xmm1
|
||||
INSTR xmm8, xmm2, xmm2
|
||||
INSTR xmm9, xmm0, xmm0
|
||||
INSTR xmm10, xmm1, xmm1
|
||||
INSTR xmm11, xmm2, xmm2
|
||||
INSTR xmm12, xmm0, xmm0
|
||||
INSTR xmm13, xmm1, xmm1
|
||||
INSTR xmm14, xmm2, xmm2
|
||||
INSTR xmm15, xmm0, xmm0
|
||||
INSTR xmm16, xmm1, xmm1
|
||||
INSTR xmm17, xmm2, xmm2
|
||||
INSTR xmm18, xmm0, xmm0
|
||||
INSTR xmm19, xmm1, xmm1
|
||||
INSTR xmm20, xmm2, xmm2
|
||||
INSTR xmm21, xmm0, xmm0
|
||||
INSTR xmm22, xmm1, xmm1
|
||||
INSTR xmm23, xmm2, xmm2
|
||||
cmp i, N
|
||||
jl loop
|
||||
done:
|
||||
mov rsp, rbp
|
||||
pop rbp
|
||||
ret
|
||||
.size latency, .-latency
|
||||
65
testcases/vxorpd-xmmxmmxmm.S
Normal file
65
testcases/vxorpd-xmmxmmxmm.S
Normal file
@@ -0,0 +1,65 @@
|
||||
#define INSTR vxorpd
#define NINST 24
#define N edi
#define i r8d

# Latency kernel for INSTR in its three-register xmm form.
#
# Exported symbols:
#   ninst    32-bit value NINST, the number of INSTR instances in one
#            pass through the loop body
#   latency  entry point; repeats the loop body until the counter in
#            r8d reaches the count passed in edi, and skips the loop
#            when that count is zero or negative
#
# The INSTR instances form one serial dependency chain through xmm0 and
# xmm1: every instance reads the register written by the instance
# before it, also across loop iterations.  Each instance names two
# different source registers, because an xor of a register with itself
# is a zeroing idiom that many x86 cores treat as independent of its
# inputs, which would cut the chain.

    .intel_syntax noprefix
    .globl ninst
    .data
ninst:
    .long NINST
    .text
    .globl latency
    .type latency, @function
    .align 32
latency:
    push      rbp
    mov       rbp, rsp
    xor       i, i
    test      N, N
    jle       done
    # create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
    vpcmpeqw  xmm0, xmm0, xmm0      # all ones
    vpsllq    xmm0, xmm0, 54        # logical left shift: only the top 10 bits stay set (54 = 64 - 10)
    vpsrlq    xmm0, xmm0, 2         # logical right shift: clears the sign bit and the top exponent bit
    # copy DP 1.0
    vmovaps   xmm0, xmm0
    vmovaps   xmm1, xmm0
    # Create DP 2.0
    vaddpd    xmm1, xmm1, xmm1
    # Create DP 0.5 (xmm2 is not read by the loop below)
    vdivpd    xmm2, xmm0, xmm1
loop:
    inc       i
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    cmp       i, N
    jl        loop
done:
    mov       rsp, rbp
    pop       rbp
    ret
    .size latency, .-latency
|
||||
65
testcases/vxorps-xmmxmmxmm-TP.S
Normal file
65
testcases/vxorps-xmmxmmxmm-TP.S
Normal file
@@ -0,0 +1,65 @@
|
||||
#define INSTR vxorps
#define NINST 24
#define N edi
#define i r8d

# Benchmark kernel with mutually independent instances of INSTR in its
# three-register xmm form.
#
# Exported symbols:
#   ninst    32-bit value NINST, the number of INSTR instances in one
#            pass through the loop body
#   latency  entry point; repeats the loop body until the counter in
#            r8d reaches the count passed in edi, and skips the loop
#            when that count is zero or negative
#
# None of the INSTR instances reads a register that another instance
# writes.  Each instance names two different source registers, because
# an xor of a register with itself is a zeroing idiom that many x86
# cores resolve at register renaming without executing it.

    .intel_syntax noprefix
    .globl ninst
    .data
ninst:
    .long NINST
    .text
    .globl latency
    .type latency, @function
    .align 32
latency:
    push      rbp
    mov       rbp, rsp
    xor       i, i
    test      N, N
    jle       done
    # create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
    # (INSTR is bitwise, so double precision operands work for it too)
    vpcmpeqw  xmm0, xmm0, xmm0      # all ones
    vpsllq    xmm0, xmm0, 54        # logical left shift: only the top 10 bits stay set (54 = 64 - 10)
    vpsrlq    xmm0, xmm0, 2         # logical right shift: clears the sign bit and the top exponent bit
    # copy DP 1.0
    vmovaps   xmm0, xmm0
    vmovaps   xmm1, xmm0
    # Create DP 2.0
    vaddpd    xmm1, xmm1, xmm1
    # Create DP 0.5
    vdivpd    xmm2, xmm0, xmm1
loop:
    inc       i
    INSTR     xmm3, xmm0, xmm1
    INSTR     xmm4, xmm1, xmm2
    INSTR     xmm5, xmm2, xmm0
    INSTR     xmm3, xmm0, xmm1
    INSTR     xmm4, xmm1, xmm2
    INSTR     xmm5, xmm2, xmm0
    INSTR     xmm6, xmm0, xmm1
    INSTR     xmm7, xmm1, xmm2
    INSTR     xmm8, xmm2, xmm0
    INSTR     xmm9, xmm0, xmm1
    INSTR     xmm10, xmm1, xmm2
    INSTR     xmm11, xmm2, xmm0
    INSTR     xmm12, xmm0, xmm1
    INSTR     xmm13, xmm1, xmm2
    INSTR     xmm14, xmm2, xmm0
    INSTR     xmm15, xmm0, xmm1
    # NOTE(review): xmm16-xmm23 can only be encoded with EVEX, so the
    # remaining instances need AVX-512 (VL, plus DQ for EVEX vxorps) and
    # are encoded differently from the VEX instances above - confirm
    # that this mix is intended.
    INSTR     xmm16, xmm1, xmm2
    INSTR     xmm17, xmm2, xmm0
    INSTR     xmm18, xmm0, xmm1
    INSTR     xmm19, xmm1, xmm2
    INSTR     xmm20, xmm2, xmm0
    INSTR     xmm21, xmm0, xmm1
    INSTR     xmm22, xmm1, xmm2
    INSTR     xmm23, xmm2, xmm0
    cmp       i, N
    jl        loop
done:
    mov       rsp, rbp
    pop       rbp
    ret
    .size latency, .-latency
|
||||
65
testcases/vxorps-xmmxmmxmm.S
Normal file
65
testcases/vxorps-xmmxmmxmm.S
Normal file
@@ -0,0 +1,65 @@
|
||||
#define INSTR vxorps
#define NINST 24
#define N edi
#define i r8d

# Latency kernel for INSTR in its three-register xmm form.
#
# Exported symbols:
#   ninst    32-bit value NINST, the number of INSTR instances in one
#            pass through the loop body
#   latency  entry point; repeats the loop body until the counter in
#            r8d reaches the count passed in edi, and skips the loop
#            when that count is zero or negative
#
# The INSTR instances form one serial dependency chain through xmm0 and
# xmm1: every instance reads the register written by the instance
# before it, also across loop iterations.  Each instance names two
# different source registers, because an xor of a register with itself
# is a zeroing idiom that many x86 cores treat as independent of its
# inputs, which would cut the chain.

    .intel_syntax noprefix
    .globl ninst
    .data
ninst:
    .long NINST
    .text
    .globl latency
    .type latency, @function
    .align 32
latency:
    push      rbp
    mov       rbp, rsp
    xor       i, i
    test      N, N
    jle       done
    # create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
    # (INSTR is bitwise, so double precision operands work for it too)
    vpcmpeqw  xmm0, xmm0, xmm0      # all ones
    vpsllq    xmm0, xmm0, 54        # logical left shift: only the top 10 bits stay set (54 = 64 - 10)
    vpsrlq    xmm0, xmm0, 2         # logical right shift: clears the sign bit and the top exponent bit
    # copy DP 1.0
    vmovaps   xmm0, xmm0
    vmovaps   xmm1, xmm0
    # Create DP 2.0
    vaddpd    xmm1, xmm1, xmm1
    # Create DP 0.5 (xmm2 is not read by the loop below)
    vdivpd    xmm2, xmm0, xmm1
loop:
    inc       i
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    INSTR     xmm0, xmm1, xmm0
    INSTR     xmm1, xmm0, xmm1
    cmp       i, N
    jl        loop
done:
    mov       rsp, rbp
    pop       rbp
    ret
    .size latency, .-latency
|
||||
100
testcases/xor-rr-TP.S
Normal file
100
testcases/xor-rr-TP.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR xor
#define NINST 24
#define N edi
#define i r8d

# Benchmark kernel for the 8-bit register, register form of INSTR,
# spread over many destination registers.
#
# Exported symbols:
#   ninst    32-bit value NINST, the number of INSTR instances in one
#            pass through the loop body
#   latency  entry point; repeats the loop body until the counter in
#            r8d reaches the count passed in edi, and skips the loop
#            when that count is zero or negative
#
# The low bytes of r9-r15 are written as r9b-r15b, the names GNU as
# documents for them.
#
# NOTE(review): INSTR reads its destination as well as its source, and
# a few instances write al, bl or cl, which other instances read, so
# the instances are not completely independent of each other.

    .intel_syntax noprefix
    .globl ninst
    .data
ninst:
    .long NINST
    .text
    .globl latency
    .type latency, @function
    .align 32
latency:
    push      rbp
    mov       rbp, rsp
    xor       i, i
    test      N, N
    jle       done
    # create DP 1.0 (0x3FF0000000000000) in both 64-bit lanes of xmm0
    vpcmpeqw  xmm0, xmm0, xmm0      # all ones
    vpsllq    xmm0, xmm0, 54        # logical left shift: only the top 10 bits stay set (54 = 64 - 10)
    vpsrlq    xmm0, xmm0, 2         # logical right shift: clears the sign bit and the top exponent bit
    # save the general-purpose registers used below
    push      rax
    push      rbx
    push      rcx
    push      rdx
    push      r9
    push      r10
    push      r11
    push      r12
    push      r13
    push      r14
    push      r15
    # clear them
    xor       rax, rax
    xor       rbx, rbx
    xor       rcx, rcx
    xor       rdx, rdx
    xor       r9, r9
    xor       r10, r10
    xor       r11, r11
    xor       r12, r12
    xor       r13, r13
    xor       r14, r14
    xor       r15, r15
    # seed rax, rbx and rcx from the bit pattern of DP 1.0 using integer
    # operations (no floating-point arithmetic happens here)
    vmovq     rax, xmm0             # rax = 0x3FF0000000000000
    vmovq     rbx, xmm0             # rbx = same bit pattern
    add       rbx, rax              # integer add: rbx = 0x7FE0000000000000
    div       rax                   # unsigned rdx:rax / rax with rdx cleared above: rax = 1, rdx = 0
    movq      rcx, rax              # rcx = 1
    vmovq     rax, xmm0             # reload rax, which div overwrote
loop:
    inc       i
    INSTR     dl, al
    INSTR     r9b, bl
    INSTR     r10b, cl
    INSTR     dl, al
    INSTR     r9b, bl
    INSTR     r10b, cl
    INSTR     r11b, al
    INSTR     r12b, bl
    INSTR     r13b, cl
    INSTR     r14b, al
    INSTR     r15b, bl
    INSTR     al, cl
    INSTR     bl, al
    INSTR     cl, bl
    INSTR     dl, cl
    INSTR     r9b, al
    INSTR     r10b, bl
    INSTR     r11b, cl
    INSTR     r12b, al
    INSTR     r13b, bl
    INSTR     r14b, cl
    INSTR     r15b, al
    INSTR     al, bl
    INSTR     bl, cl
    cmp       i, N
    jl        loop
    # restore the saved registers in reverse order
    pop       r15
    pop       r14
    pop       r13
    pop       r12
    pop       r11
    pop       r10
    pop       r9
    pop       rdx
    pop       rcx
    pop       rbx
    pop       rax
done:
    mov       rsp, rbp
    pop       rbp
    ret
    .size latency, .-latency
|
||||
100
testcases/xor-rr.S
Normal file
100
testcases/xor-rr.S
Normal file
@@ -0,0 +1,100 @@
|
||||
#define INSTR xor
#define NINST 24
#define N edi
#define i r8d

# Latency kernel for the 8-bit register, register form of INSTR.
#
# Exported symbols:
#   ninst    32-bit value NINST, the number of INSTR instances in one
#            pass through the loop body
#   latency  entry point; repeats the loop body until the counter in
#            r8d reaches the count passed in edi, and skips the loop
#            when that count is zero or negative
#
# The INSTR instances alternate between al and bl as destination, so
# every instance reads the register written by the instance before it.

    .intel_syntax noprefix

    .data
    .globl ninst
ninst:
    .long NINST

    .text
    .globl latency
    .type latency, @function
    .align 32
latency:
    push      rbp
    mov       rbp, rsp
    xor       i, i
    test      N, N
    jle       done

    # build the bit pattern of DP 1.0 (0x3FF0000000000000) in xmm0
    vpcmpeqw  xmm0, xmm0, xmm0      # every bit set
    vpsllq    xmm0, xmm0, 54        # keep only the top 10 bits of each 64-bit lane
    vpsrlq    xmm0, xmm0, 2         # move them below the sign bit and the top exponent bit

    # save the general-purpose registers touched below
    push      rax
    push      rbx
    push      rcx
    push      rdx
    push      r9
    push      r10
    push      r11
    push      r12
    push      r13
    push      r14
    push      r15

    # clear them
    xor       rax, rax
    xor       rbx, rbx
    xor       rcx, rcx
    xor       rdx, rdx
    xor       r9, r9
    xor       r10, r10
    xor       r11, r11
    xor       r12, r12
    xor       r13, r13
    xor       r14, r14
    xor       r15, r15

    # seed rax, rbx and rcx with integer operations on that bit pattern
    vmovq     rax, xmm0             # rax = 0x3FF0000000000000
    vmovq     rbx, xmm0             # rbx = same bit pattern
    add       rbx, rax              # integer add: rbx = 0x7FE0000000000000
    div       rax                   # unsigned rdx:rax / rax with rdx cleared above: rax = 1, rdx = 0
    movq      rcx, rax              # rcx = 1
    vmovq     rax, xmm0             # reload rax, which div overwrote

loop:
    inc       i
    # 12 repetitions of the pair below give the 24 instances counted by ninst
    .rept 12
    INSTR     al, bl
    INSTR     bl, al
    .endr
    cmp       i, N
    jl        loop

    # restore the saved registers in reverse order
    pop       r15
    pop       r14
    pop       r13
    pop       r12
    pop       r11
    pop       r10
    pop       r9
    pop       rdx
    pop       rcx
    pop       rbx
    pop       rax
done:
    mov       rsp, rbp
    pop       rbp
    ret
    .size latency, .-latency
|
||||
Reference in New Issue
Block a user