Nil Satis Nisi Optimum - Logoszféra fórum

üzenetek

hozzászólások


P.H.
(senior tag)

Nincs többé külön K10-nek tetsző és külön Core2-nek tetsző verzió: egyetlen van, amely mindkettőn jó, kb. 0.5% veszteséggel.

Core2 (2.5 GHz): 58 sec alatt megoldja a feladatot
K10 (2.9 GHz): 44 sec alatt megoldja a feladatot
Prescott (2.26 GHz): 60 sec alatt 220000 mátrix
Northwood (2.4 GHz): 60 sec alatt 248000 mátrix

{ First visible instructions of the routine; its header lies outside this excerpt. }
{ NOTE(review): the leading marks such as @04, 0/1/2, * and - are the author's own layout annotations, presumably code offset, decode slot, fused pair and register move; confirm with the author. }
{ Loads the first signed byte at EDX into EBX; the ES prefix is tagged x1, presumably one byte of padding. }
{@04-} { x1 } movsx ebx,byte ptr es:[edx]
{1-} mov esi,ebp
{2-} xor eax,eax
{ ESI = EBP with its low three bits cleared. }
{0} and esi,-8
{ Zero-fill loop: stores EAX = 0 into the __0STARROW slots at index ESI and ESI+4, stepping ESI by 8 until it reaches zero. }
{ NOTE(review): termination at zero implies that ESI, hence EBP, starts negative; confirm against the caller. }
@init:
{@0F} mov [edi+esi*08h+(00h*08h)+__0STARROW],eax
{1} mov [edi+esi*08h+(04h*08h)+__0STARROW],eax
{2*} add esi,08h
{0*} jnz @init { clears ESI register }
{ Step the byte pointer past the byte already loaded and start the next loop index at EBP. }
{ } add edx,01h
{ -} mov ecx,ebp
{ Per-entry setup loop: ECX runs from EBP up to zero in steps of 4 and one signed byte is consumed per pass. }
{ EBX holds the byte read on the previous pass; ESI is zero here because the preceding fill loop exits with ESI = 0. }
@@ARGUMENT: { K10:2.6 Core2:2.9 - 3.3 uop/clk - 1640*2+6550 }
{ The flags of this CMP stay live until the CMOVS below: LEA, MOVSX and MOV do not modify flags. }
{@20} cmp ebx,esi { 4 AGU + 9 EX uops on Kaveri }
{ EAX = EBP + 4*EBX. }
{1} lea eax,[ebp+ebx*04h+00h] { 3 clk 8 ALU ops on Core 2 }
{ Fetch the byte for the next pass and advance the byte pointer without touching flags. }
{2} movsx ebx,byte ptr [edx]
{0} lea edx,[edx+01h] { db $8D,$52,$00 }
{ Writes EBP into the __0STARROW slot selected by EAX; this store happens for negative bytes as well. }
{1} mov [edi+eax*08h+__0STARROW],ebp { __0COUNTER <- EBP }
{ When the byte was negative the value stored into __FIXEDROW is zero instead of the computed index. }
{2} cmovs eax,esi
{0} mov [edi+ecx*08h+__FIXEDROW],eax
{1*} add ecx,04h
{2*} jnz @@ARGUMENT { clears ECX register }
{ Set-up for the code that follows: ECX = 0, EAX = EDI, EBP saved on the stack, EDX = EBP - 4. }
{ -} { x2 } xor ecx,ecx
{ -} mov eax,edi
{ -} push ebp
{@40-} lea edx,[ebp-04h]
{ Row pass. Each trip stores ECX as the __ROWMODIFIER of the slot at EDX, then moves EDX to the next slot. }
{ ECX is either zero or the negated minimum computed at the bottom of this block. }
@@REDUCE_ROWS:
{@43} mov [edi+edx*08h+__ROWMODIFIER],ecx
{1} mov esi,[edi+edx*08h+(04h*08h)+__FIXEDROW]
{2*} add edx,04h
{ EDX reaching zero ends the row pass. }
{0*} jz @@REDUCE_COLUMNS
{ The __FIXEDROW value is copied into __0STAR; EAX is moved on by -EBP for every slot visited. }
{@50} mov [edi+edx*08h+__0STAR],esi
{2-} xor ecx,ecx
{0} sub eax,ebp
{ A negative __FIXEDROW value skips the minimum search, leaving ECX = 0 as the modifier. }
{1**} test esi,esi { JS/JNS can only fuse with TEST }
{2**} js @@REDUCE_ROWS
{ Two running minima are kept: EBX seeded with EBP, ECX seeded with the first element OR-ed with its __0STARROW word. }
{ -} mov ebx,ebp { EBX < 0 for even minimum }
{ } mov ecx,[eax+ebp]
{@61} or ecx,[edi+ebp*08h+__0STARROW]
{ EBP = EBP + (EBP and 4): the start index is bumped by 4 when bit 2 is set, so the loop below can advance by 8. }
{ } and ebp,04h
{ } add ebp,ebx
{ Unrolled by two: each pass ORs an element with its __0STARROW word and folds it into EBX or ECX with an unsigned compare. }
{@69} @findrowmin: { K10:2.8 Core2:2.2 - 2.6 uop/clk - 1100*2+5000 }
{0} mov esi,[eax+ebp] { 4 AGU + 8 EX uops on Kaveri }
{1} or esi,[edi+ebp*08h+(00h*08h)+__0STARROW] { 3 clk 10 ALU ops on Core 2 }
{2} add ebp,08h
{@72} cmp esi,ebx
{1} cmovb ebx,esi
{2} mov esi,[eax+ebp-04h]
{0} or esi,[edi+ebp*08h-(04h*08h)+__0STARROW]
{1} cmp esi,ecx
{@81} cmovb ecx,esi
{0**} test ebp,ebp
{1**} jnz @findrowmin
{ Reload EBP from the top of the stack, where it was pushed before this block. }
{ } mov ebp,[esp+00h]
{ ECX = unsigned minimum of the two partial minima, then negated. }
{ } cmp ebx,ecx
{ } cmovb ecx,ebx
{@90} neg ecx
{ Loop back while the negated minimum is zero or negative; a positive result falls through to the label after the NOP. }
{ } jle @@REDUCE_ROWS
{ -} { x1 } nop
{ Failure exit: records -1 as the optimum and leaves through an indirect jump taken from the stack frame. }
@@ABNORMAL_EXIT:
{ Removes the saved EBP from the stack; the popped value in EAX is not used in this block. }
{@95} pop eax
{ EDX = -1; tagged x3 by the author, presumably for the three-byte encoding. }
{1} { x3 } or edx,0FFFFFFFFh
{ ESI = pointer read from the __MARKS stack slot; it is used below as the base of a TRESULT record. }
{2} mov esi,[esp+__MARKS]
{@A0} mov [esi+TRESULT.OPTIMUM],edx
{1} mov ebx,[esi+TRESULT.NEXTIVALUE]
{ Control leaves through the code address stored in the _INVALIDRESULT stack slot. }
{2} jmp dword ptr [esp+_INVALIDRESULT]
{ Follows an unconditional jump and carries no label, so it is not reached by fall-through. }
{ x1 } nop
{ Column pass. The entry point is @@REDUCE_COLUMNS; @init0col is the exit taken once EDX has walked down to EBP. }
{ Exit path: saves ECX to __INITCOL, keeps the old EBP in ESI, negates EBP, sets EBX = -1 and jumps to @@1ST_STEP. }
{@AB} @init0col:
{0} mov [edi+__INITCOL],ecx
{1-} mov esi,ebp
{@B0} neg ebp
{0} mov ebx,0FFFFFFFFh
{1*} sub ecx,04h
{ NOTE(review): the author calls this a forced conditional jump, so it is presumably always taken; confirm that ECX can never equal 4 here. }
{2*} jnz @@1ST_STEP { long jump instruction } { forced conditional jump for Sandy Bridge }
{@C0} { x2 } xor eax,eax
{ Stores ESI as the __COLMODIFIER of slot EDX and remembers EDX in ECX. }
{@C2} @free0col:
{ } mov [edi+edx*08h+__COLMODIFIER],esi { no need to initialize __COLMODIFIER of fixed column }
{ -} mov ecx,edx
{@C8} @next0col:
{ **} cmp edx,ebp
{ **} jz @init0col
{ Steps EDX down by 4 and examines the __0STARROW word of the new slot. }
{ SUB EAX,1 carries only when that word was zero: non-zero slots are skipped, and for a zero slot EAX becomes FFFFFFFFh, the seed of the unsigned minimum. }
@@REDUCE_COLUMNS: { no need to initialize -initcol in ECX }
{0} mov eax,[edi+edx*08h-(04h*08h)+__0STARROW]
{@D0} sub edx,04h
{2*} sub eax,01h
{0*} jnc @next0col
{ EBX = EDI + EDX - EBP is the address of the first element read; ECX restarts at EBP. The ES prefix is tagged x1, presumably padding. }
{ } { x1 } lea ebx,es:[edi+edx]
{ -} mov ecx,ebp
{ } sub ebx,ebp
{ Minimum search: ECX runs from EBP up to zero in steps of 4 while EBX moves on by -EBP per element. }
{ Each element is added to __ROWMODIFIER and OR-ed with __FIXEDROW; a zero result leaves the loop at @test0row. }
{@E0} @findcolmin: { K10:3.0 Core2:_._ - _._ uop/clk - ____*2+____ }
{0} mov esi,[ebx] { 3 AGU + 8 EX uops on Kaveri }
{1} add esi,[edi+ecx*08h+__ROWMODIFIER] { 3 clk 9 ALU ops on Core 2 }
{2} or esi,[edi+ecx*08h+__FIXEDROW]
{0} jz @test0row
{1} sub ebx,ebp
{2} cmp esi,eax
{@F0} cmovb eax,esi
{1*} add ecx,04h
{2*} jnz @findcolmin
{ Rewind for the second scan: ECX = EBP - 4, ESI = minimum found, EBX = EDI + EDX. }
{ } lea ecx,[ebp-04h]
{ -} mov esi,eax
{ } lea ebx,[edi+edx]
{ A minimum with the sign bit set is treated as failure. }
{@00**} test eax,eax { JS/JNS can only fuse with TEST }
{ **} js @@ABNORMAL_EXIT
{ Second scan: looks for an element whose sum with __ROWMODIFIER equals the value in ESI. }
{ ECX reaching zero ends the scan at @free0col without a match. }
{@04} @seekcol0:
{0} mov eax,[edi+ecx*08h+(04h*08h)+__ROWMODIFIER]
{1*} add ecx,04h
{2*} jz @free0col
{0} sub ebx,ebp
{1} add eax,[ebx]
{@11**} cmp eax,esi { maximum data value = 00FFFFFFh -> marked elements stay negative }
{0**} jnz @seekcol0
{ The sign flag of __0STAR and EBP decides: when it is set the scan resumes, otherwise slot ECX and slot EDX are linked both ways. }
@test0row:
{ **} test [edi+ecx*08h+__0STAR],ebp { JS/JNS can only fuse with TEST }
{ **} js @seekcol0
{ } mov [edi+edx*08h+__0STARROW],ecx
{@1E} mov [edi+ecx*08h+__0STAR],edx
{ Always taken: the JS above was not taken and the two MOV stores leave the flags unchanged. }
{@22} jns @free0col { forced conditional jump for Sandy Bridge }
{ ----------------------------------------------------------------------------------------------- }
{ NOTE(review): the next two lines are tagged x12 and x5 and carry no label; presumably alignment filler that is never executed, confirm. }
{@24} { x12 } mov eax,00000000h; mov edx,00000000h; xor ecx,ecx
{@30} { x5 } mov esi,00000000h
{ Modifier update. ESI = sign-extended low word of __MINCOLROW; EBX is moved on by -EBP to form the loop start index. }
{ NOTE(review): EDX is assumed to hold the minimum found by the search that jumps here; confirm. }
@@5TH_STEP: { K10:2.2 Core2:2.2 - 2.7 uop/clk - 3050*2+3700 }
{@35} movsx esi,word ptr [edi+__MINCOLROW+00h]
{ } sub ebx,ebp
{ } movsx eax,word ptr [edi+ebx*08h+__SIGN-(04h*08h)+__COLMARK]
{ Per slot: a word at offset __SIGN inside the mark fields is sign-extended, AND-ed with EDX and added to the modifier. }
{ The sign-extended column word is also written back over the full __COLMARK dword; EAX is preloaded for the next slot. }
{@40} @5th_step: { 5 AGU + 11 EX uops on Kaveri }
{0} movsx ecx,word ptr [edi+ebx*08h+__SIGN+__0COLON___ROWMARK] { 4 clk 6 ALU ops on Core 2 }
{1} mov [edi+ebx*08h-(04h*08h)+__COLMARK],eax
{2} and eax,edx
{0} add [edi+ebx*08h+__COLMODIFIER],eax
{@4F} and ecx,edx
{2} movsx eax,word ptr [edi+ebx*08h+__SIGN-(04h*08h)+(04h*08h)+__COLMARK] { __MINCOLROW col }
{0} add [edi+ebx*08h+__ROWMODIFIER],ecx
{1*} add ebx,04h
{2*} jnz @5th_step { clears EBX register }
{ After the loop: ECX = __INITCOL, EDX = the slot index taken from __MINCOLROW, ESI = the __0STAR word of that slot. }
{@5F} mov ecx,[edi+__INITCOL]
{1-} mov edx,esi
{2} mov esi,[edi+esi*08h+__0STAR]
{ A zero __0STAR word goes to @@4TH_STEP; otherwise the slot is marked and scanning resumes at @test2col. }
{0**} test esi,esi
{1**} jz @@4TH_STEP { long jump instruction }
{@70} mov [edi+edx*08h+__0COLON___ROWMARK],eax { set row mark }
{0} mov dword ptr [edi+esi*08h-(04h*08h)+__COLMARK],-1 { unmark column with -1 }
{1} jmp @test2col
{ The two XORs follow an unconditional jump and carry no label, so they are not reached by fall-through. }
{ x2 } xor esi,esi
{@80} { x2 } xor eax,eax
{ Search section. Columns are taken in turn at @test2col and each one is scanned in @ZERO2col. }
{ @fast2forward resumes the scan of the current column at the next slot; when EBX wraps to zero the column is finished. }
{@82} @fast2forward:
{ *} add ebx,04h
{ *} jnz @continue
{@87} @pass2col:
{ } mov [edi+ecx*08h-(04h*08h)+__COLMARK],ecx { re-mark column with its index != -1 }
{@8B} @next2col:
{0*} add ecx,04h
{1*} jz @@5TH_STEP { clears ECX register }
{ A column is skipped when its __COLMARK dword is below or equal, unsigned, to its own index in ECX. }
@test2col:
{@90**} cmp [edi+ecx*08h-(04h*08h)+__COLMARK],ecx
{0**} jbe @next2col
{ Column start: EBX is moved on by -EBP and EAX = ECX + EDI. }
@@2ND_STEP:
{ } sub ebx,ebp { ordered for Core2 }
{ } lea eax,[ecx+edi] { ordered for Core2 }
{ The __COLMODIFIER of the column is kept on the stack for the duration of the scan. }
{ ECX is shifted left by 16 so that its low word is free to receive a slot index from BX. }
{@9B} @continue:
{ } mov esi,[edi+ecx*08h+__COLMODIFIER]
{ } push esi
{@A0} sal ecx,10h
{ } mov esi,[edi+ebx*08h+__ROWMODIFIER]
{ Per slot: ESI = __ROWMODIFIER - saved __COLMODIFIER + element at EAX+EBP, and EAX moves on by EBP. }
{ LEA keeps the flags of the ADD, so JO tests signed overflow of that addition. }
{ The sum is OR-ed with __0COLON___ROWMARK: zero leaves for @@3RD_STEP, otherwise an unsigned minimum is kept in EDX with its slot in CX. }
{@A7} @ZERO2col: { K10:3.0 Core2:2.5 - 2.9 uop/clk - 1500*2+5600 } { 4 AGU + 11 EX uops on Kaveri }
{0} sub esi,[esp+00h] { 4 clk 13 ALU ops on Core 2 }
{1} add esi,[eax+ebp]
{2} lea eax,[eax+ebp]
{@B0} jo @over2flow { overflow: (-x)+(-y)=(+z) or (+x)+(+y)=(-z) }
{1} or esi,[edi+ebx*08h+__0COLON___ROWMARK]
{2} jz @@3RD_STEP
{0} cmp esi,edx
{1} cmovb edx,esi
{2} cmovb cx,bx
{@C1} @over2flow:
{0} mov esi,[edi+ebx*08h+(04h*08h)+__ROWMODIFIER]
{1*} add ebx,04h
{2*} jnz @ZERO2col { clears EBX register }
{ Reached on a zero result or at the end of the column. The POP only drops the saved modifier. }
{ SAR restores the column index in ECX and leaves bit 15 of the packed value in the carry flag. }
{ With carry clear the old __MINCOLROW is kept; with carry set the packed ECX value replaces it. }
{ NOTE(review): this relies on a recorded slot index having bit 15 set, presumably because indices are negative; confirm. }
@@3RD_STEP:
{@CA} pop esi { add esp,04h } { enforces ESP tracking to AGU/load pipe on Bulldozer/Core }
{1-} mov esi,ecx
{2} sar ecx,10h
{@D0} cmovnc esi,[edi+__MINCOLROW]
{1} mov [edi+__MINCOLROW],esi
{ EBX = 0 means the scan ran off the end of the column without a zero result. }
{2**} { x1 } cmp ebx,00h
{0**} jz @pass2col
{ A zero result in a slot whose __0STAR word is zero goes to @4TH_STEP. }
{1} mov esi,[edi+ebx*08h+__0STAR]
{@E0**} test esi,esi
{0**} jz @4TH_STEP
{1} mov [edi+ebx*08h+__0COLON___ROWMARK],ecx { set row mark }
{2} mov dword ptr [edi+esi*08h-(04h*08h)+__COLMARK],-1 { unmark column with -1 }
{ When the slot just marked is the one recorded in __MINCOLROW the search is restarted at @re2start. }
{@F0**} cmp word ptr [edi+__MINCOLROW],bx { STORE FORWARDED }
{1**} jz @re2start
{ Otherwise either continue in the current column or switch to the column in ESI with EBX = 0. }
{2**} cmp esi,ecx { jb = jl for 2 negative numbers }
{0**} jae @fast2forward
{1-} xor ebx,ebx
{2-} mov ecx,esi
{0} jmp @@2ND_STEP
{ Restart: ECX = __INITCOL, EDX = EBX, and the loop index EBX = -EBP. }
{@00} @re2start:
{0} mov ecx,[edi+__INITCOL]
{1-} mov edx,ebx
{2-} mov ebx,ebp
{0} neg ebx
{ Rewrites every __COLMARK dword with the sign-extended word found at offset __SIGN inside it. }
{@09} @init2col:
{0} movsx eax,word ptr [edi+ebx*08h+__SIGN-(04h*08h)+__COLMARK]
{1} mov [edi+ebx*08h-(04h*08h)+__COLMARK],eax
{2*} add ebx,04h
{0*} jnz @init2col { clears EBX register }
{ } jmp @test2col { long jump instruction }
{ The two instructions below follow an unconditional jump and carry no label, so they are not reached by fall-through. }
{ } { x4 } lea eax,[ebp+ebp+00h]
{@20} { x2 } test edi,edi
{ Relinking loop with three entry points: @@4TH_STEP takes the pair from EAX and EDX, @re4order takes ECX and EDX, @4TH_STEP takes ECX and EBX. }
@@4TH_STEP:
{@22-} mov ecx,eax
@re4order:
{@24-} mov ebx,edx
{ Each pass: EDX = old __0STARROW of slot ECX, then slots EBX and ECX are linked to each other through __0STAR and __0STARROW. }
{ The next ECX comes from the __0COLON___ROWMARK of slot EDX; the loop repeats with EBX = EDX until EDX is zero. }
@4TH_STEP:
{@26} mov edx,[edi+ecx*08h+__0STARROW]
{2} mov [edi+ebx*08h+__0STAR],ecx
{0} mov [edi+ecx*08h+__0STARROW],ebx
{@30} mov ecx,[edi+edx*08h+__0COLON___ROWMARK]
{2**} { x1 } cmp edx,00h
{0**} jnz @re4order { clears EDX register }
{ EDX is zero here, so EDX becomes -EBP; ESI is moved on by -EBP and ECX = ESI - 4. }
{ NOTE(review): both jumps into this block are taken with ESI = 0, so ESI presumably also ends as -EBP; confirm. }
{ } sub esi,ebp
{ } sub edx,ebp
{ } lea ecx,[esi-04h]
{ Mark initialisation: ESI runs up to zero in steps of 4. }
{ Per slot: __COLMARK receives the bitwise complement of __0STARROW and __0COLON___ROWMARK receives __FIXEDROW. }
{ EBX accumulates the AND of all __0STARROW words; NOT and MOV leave the flags alone, so CMOVS acts on the sign of that AND. }
{ ECX follows ESI for as long as the accumulated AND stays negative. }
@@1ST_STEP: { K10:2.8 Core2:2.9 - 3.2 uop/clk - 1500*2+6100 }
{@40} mov eax,[edi+esi*08h+__0STARROW] { 4 AGU + 7 EX uops on Kaveri }
{1} and ebx,eax { clears EBX at uncomplete calculation } { 3 clk 6 ALU ops on Core 2 }
{2} not eax
{0} mov [edi+esi*08h-(04h*08h)+__COLMARK],eax
{1} mov eax,[edi+esi*08h+__FIXEDROW]
{2} cmovs ecx,esi
{0} mov [edi+esi*08h+__0COLON___ROWMARK],eax
{1*} add esi,04h
{2*} jnz @@1ST_STEP { clears ESI register [NOT USED] }
{ ECX + 4 is non-zero when the sign of the accumulated AND dropped before the last slot; the search then starts at @@2ND_STEP. }
{ A zero result falls through to the code after this block. }
{ *} add ecx,04h { long jump instruction }
{ *} jnz @@2ND_STEP { ===>>> EBX: 00h EDX:negative = -EBP ECX:initcol (>= EBP) }
{ Result write-out, reached by fall-through with ECX = 0. The excerpt ends after this loop; the rest of the routine is not visible. }
{ EBX starts at EDI; ESI = pointer read from the __MARKS stack slot, offset by 4 because EBP is still on the stack. }
{ -} mov ebx,edi { work matrix unmodified } { [esp+__SAVE] }
{ } mov esi,[esp+04h+__MARKS]
{ Per slot, with EDX running up to zero in steps of 4: }
{ EAX = __0STAR word of the slot, EBX moves on by EBP, and the dword at EBX+EAX is added to the running sum in ECX. }
{ The low byte of (EAX + EBP) shr 2 is written to the output byte at ESI, which then advances by one. }
@@results:
{@6A} mov eax,[edi+edx*08h+__0STAR] { 3 AGU + 8 EX uops on Kaveri }
{1} add ebx,ebp
{@70} add ecx,[ebx+eax]
{0} add eax,ebp
{1} shr eax,02h
{2} mov [esi],al
{0} add esi,01h
{1*} add edx,04h
{@80*} jnz @@results { clears EDX register ( DL=0 as head, DH=0 as length ) }

üzenetek