Nil Satis Nisi Optimum - Logoszféra fórum

üzenetek

hozzászólások


P.H.
(senior tag)

Továbbra is egyszálú program.

Core2 (2.5 GHz): 60 sec alatt 507000 mátrix
Sandy Bridge (G1620 2.7 GHz): a K10-es ciklusverzióval 59 sec, a Core2-essel 60 sec

Akkor is meg fogja oldani 2500 MHz-en bármely Core egy szálon ezt a feladatot; ha kell, fél évig mászok fel hetente 1000 mátrixot, de meg fogja.

{ Fragment of a 32-bit x86 inline-assembly routine written with brace comments. }
{ The routine entry and the code after @@results are not part of this listing, so the }
{ initial contents of EBP, EDI, EDX and ESP are established elsewhere. }
{ Line tags: @xx presumably is the low byte of the code address, 0/1/2 presumably a }
{ decode-slot position, * and ** presumably mark instruction pairs meant to fuse, and }
{ xN presumably counts padding bytes - none of this is confirmed by the listing itself. }
{ Loops run a negative index up to zero, so the ADD that advances the index also }
{ produces the zero flag tested by the closing JNZ. }
{@04-} { x1 } movsx ecx,byte ptr es:[edx]
{1-} xor eax,eax
{2-} mov esi,ebp
{0} and esi,-8
{ Zero-fill: stores EAX (0) at the __0STARROW slots 00h*08h and 04h*08h, stepping ESI by 8. }
@init:
{@0F} mov [edi+esi*08h+(00h*08h)+__0STARROW],eax
{1} mov [edi+esi*08h+(04h*08h)+__0STARROW],eax
{2*} add esi,08h
{0*} jnz @init { clears ESI register }
{ } add edx,01h
{ -} mov ebx,ebp
{ Reads one sign-extended byte per pass through EDX. CMP ECX,ESI compares the previously }
{ loaded byte with ESI (0 after @init); if it is negative, CMOVS replaces EAX with ESI }
{ before EAX is stored in __FIXEDROW. }
@@ARGUMENT: { K10:2.6 Core2:2.9 - 3.3 uop/clk - 1640*2+6550 }
{@20} cmp ecx,esi { 4 AGU + 9 EX uops on Kaveri }
{1} lea eax,[ebx+ecx*04h] { 3 clk 8 ALU ops on Core 2 }
{2} movsx ecx,[edx]
{0} lea edx,[edx+01h] { db $8D,$52,$00 }
{1} mov [edi+eax*08h+__0STARROW],ebx { __0COUNTER <- EBP }
{2} cmovs eax,esi
{0} mov [edi+ebp*08h+__FIXEDROW],eax
{1*} add ebp,04h
{2*} jnz @@ARGUMENT { clears EBP register }
{ -} mov eax,edi
{ -} mov ebp,ebx
{ -} xor ecx,ecx
{@40} add esp,ebx
{ } lea edx,[ebx-04h]
{ Row pass: ECX is written to __ROWMODIFIER. Entries whose __FIXEDROW value is negative }
{ loop back with ECX = 0. Otherwise @findrowmin keeps in ECX the unsigned minimum (CMOVB) }
{ of each value at [eax+ebx] OR-ed with its __0STARROW entry, then ECX is negated; }
{ a positive result after NEG falls through to @@ABNORMAL_EXIT. }
@@REDUCE_ROWS:
{@45} mov [edi+edx*08h+__ROWMODIFIER],ecx
{1} mov esi,[edi+edx*08h+(04h*08h)+__FIXEDROW]
{2*} add edx,04h
{0*} jz @@REDUCE_COLUMNS
{1} mov [edi+edx*08h+__0STAR],esi
{2-} xor ecx,ecx
{0} sub eax,ebp
{1**} test esi,esi
{2**} js @@REDUCE_ROWS
{0-} mov ecx,ebp
{@60} @findrowmin: { 2 AGU + 5 EX uops on Kaveri }
{0} mov esi,[eax+ebx] { 2 clk 6 ALU ops on Core 2 }
{1} or esi,[edi+ebx*08h+__0STARROW]
{2} cmp esi,ecx
{0} cmovb ecx,esi
{1*} add ebx,04h
{2*} jnz @findrowmin
{@70-} mov ebx,ebp
{ } neg ecx
{ } jle @@REDUCE_ROWS
{ Failure path: stores 0FFFFFFFFh in TRESULT.OPTIMUM, loads TRESULT.NEXTIVALUE into EBX }
{ and leaves through the pointer kept at [esp+_INVALIDRESULT]. }
@@ABNORMAL_EXIT:
{@76} or edx,0FFFFFFFFh
{1} sub esp,ebp
{@7E} mov esi,[esp+__MARKS]
{0} mov [esi+TRESULT.OPTIMUM],edx
{1} mov ebx,[esi+TRESULT.NEXTIVALUE]
{2} jmp dword ptr [esp+_INVALIDRESULT]
{ The next line follows an unconditional JMP and carries no label - presumably padding only. }
{ x4 } xor eax,eax; xor edx,edx
{ @initcol: saves ECX in __INITCOL, pushes the negated EBP, sets EBX to -1 and enters @@1ST_STEP. }
{@90} @initcol:
{0} mov [edi+__INITCOL],ecx
{1-} mov esi,ebp
{2} neg ebp
{0} push ebp
{1} or ebx,-1
{2} jmp @@1ST_STEP { long jump instruction }
{@A0} { x2 } xor eax,eax
{@A2} @free0col:
{ } lea ecx,[edx-04h]
{@A5} @setcolmod:
{ } mov [edi+edx*08h+__COLMODIFIER],esi
{ Column pass: EDX steps down by 4 until it equals EBP. TEST/JS sends an index straight to }
{ @setcolmod with ESI = 0 when both its __0STARROW entry and EBP have the sign bit set. }
{ @findcolmin keeps the unsigned minimum in EAX; a zero result jumps to @test0row. }
@@REDUCE_COLUMNS: { no need to initialize -initcol in ECX }
{0**} cmp edx,ebp
{1**} jz @initcol
{0} sub edx,04h
{@B0-} xor esi,esi
{1**} test [edi+edx*08h+__0STARROW],ebp
{2**} js @setcolmod
{ } lea ebx,[edi+edx]
{ -} mov ecx,ebp
{ -} mov eax,ebp
{ } sub ebx,ebp
{@C0} @findcolmin:
{0} mov esi,[ebx] { 3 AGU + 8 EX uops on Kaveri }
{1} add esi,[edi+ecx*08h+__ROWMODIFIER] { 3 clk 9 ALU ops on Core 2 }
{2} or esi,[edi+ecx*08h+__FIXEDROW]
{0} jz @test0row
{1} sub ebx,ebp
{2} cmp esi,eax
{@D0} cmovb eax,esi
{1*} add ecx,04h
{2*} jnz @findcolmin
{ } lea ecx,[ebp-04h]
{ -} mov esi,eax
{ } lea ebx,[edi+edx]
{@E0**} test eax,eax { JS/JNS can only fuse with TEST }
{ **} js @@ABNORMAL_EXIT
{ @seekcol0: looks for an index whose __ROWMODIFIER plus the value at [ebx] equals the minimum in ESI. }
{@E4} @seekcol0:
{0} mov eax,[edi+ecx*08h+(04h*08h)+__ROWMODIFIER]
{1*} add ecx,04h
{2*} jz @free0col
{0} sub ebx,ebp
{1} add eax,[ebx]
{@F1**} cmp eax,esi { maximum data value = 00FFFFFFh -> marked elements stay negative }
{0**} jnz @seekcol0
{ @test0row: resumes the search when the TEST of __0STAR[ECX] against EBP is negative; }
{ otherwise links the pair by storing ECX in __0STARROW[EDX] and EDX in __0STAR[ECX]. }
@test0row:
{ **} test [edi+ecx*08h+__0STAR],ebp
{ **} js @seekcol0
{ } mov [edi+edx*08h+__0STARROW],ecx
{@FE} mov [edi+ecx*08h+__0STAR],edx
{@02} jns @free0col { forced conditional jump for Sandy Bridge }
{ ----------------------------------------------------------------------------------------------- }
{@04} { x12 } mov eax,00000000h; mov edx,00000000h; xor ecx,ecx
{@10} { x2 } xor ebp,ebp
{ @DEC5_free_col: SAR 1Fh turns each __COLMARK entry into an all-ones or all-zero mask, }
{ so EDX is added to __COLMODIFIER only where the mark is negative. }
@@5TH_STEP: { K10:2.5 Core2:2.4 - 2.8 uop/clk - 1900*2+4800 }
{@12} mov eax,[edi+__INITCOL] { lea ebx,[ebp-04h] }
{1} mov esi,[esp+__SIZE]
{2} add eax,04h
{0} movsx ebx,word ptr [edi+__MINCOLROW]
{@20} @DEC5_free_col: { 3 AGU + 6 EX uops on Kaveri }
{0} mov ebp,[edi+eax*08h+__COLMARK] { 2 clk 5 ALU ops on Core 2 }
{1} sar ebp,1Fh
{2} and ebp,edx
{0} add [edi+eax*08h+__COLMODIFIER],ebp
{1*} add eax,04h
{@30*} jnz @DEC5_free_col { clears EAX register }
{ } mov eax,[esp+__SIZE+esi*04h]
{ } movsx ecx,word ptr [edi+__MINCOLROW+02h]
{ } jmp @INC5_marked_row
{ x4 } xor ebp,ebp; xor esi,esi
{ @inc5row: adds EDX to __ROWMODIFIER for the index in EAX; the following index is read }
{ from the stack at [esp+esi*04h] while ESI counts down past zero. }
{@40} @inc5row:
{0} add [edi+eax*08h+__ROWMODIFIER],edx { 4 AGU + 4 EX uops on Kaveri }
{1-} mov eax,ebp
@INC5_marked_row:
{2} mov ebp,[esp+esi*04h]
{0*} sub esi,01h
{1*} jge @inc5row { sets ESI to 0FFFFFFFFh }
{ @@3RD_STEP: loads __0STAR[EBX] into ESI (ESI is all ones here); zero goes to @4TH_STEP. }
@@3RD_STEP:
{@4E*} and esi,[edi+ebx*08h+__0STAR]
{@52*} jz @4TH_STEP { long jump instruction }
{@58} @re3start:
{ } mov [edi+ebx*08h+__0COLON___ROWMARK],ecx { set row mark }
{ } { x1 } mov ecx,es:[edi+__INITCOL] { lea ecx,es:[ebp-04h] }
{@60-} mov edx,ebx
{@62} @mark3row:
{ } mov [esp+__OFFS+eax*04h],ebx
{ -} xor ebx,ebx
{ } mov [edi+esi*08h+__COLMARK],esi { unmark column with negative }
{ } add dword ptr [esp+__SIZE],01h
{ @chk2col: advances ECX by 4 and skips entries whose __COLMARK TEST is not negative; }
{ reaching zero goes to @@5TH_STEP. }
{@71} @chk2col:
{0*} add ecx,04h
{1*} jz @@5TH_STEP
{2**} test [edi+ecx*08h+__COLMARK],ecx { STORE FORWARDED from @mark3row }
{0**} jns @chk2col
{ @@2ND_STEP: pushes __COLMODIFIER[ECX] and moves ECX into its own upper 16 bits (SAL 10h). }
{ @ZERO2col: ESI = __ROWMODIFIER[EBX] - pushed value + value at [eax+ebp], OR-ed with }
{ __0COLON___ROWMARK[EBX]. Zero leaves through @zero; otherwise EDX keeps the unsigned }
{ minimum and CX the EBX index that produced it. Signed overflow skips the entry. }
@@2ND_STEP:
{12} push dword ptr [edi+ecx*08h+__COLMODIFIER]
{@80} lea eax,[ecx+edi]
{ } sub ebx,ebp
{ } sal ecx,10h
{ } mov esi,[edi+ebx*08h+__ROWMODIFIER]
{@8C} @ZERO2col: { K10:3.0 Core2:2.5 - 2.9 uop/clk - 1500*2+5600 } { 4 AGU + 11 EX uops on Kaveri }
{0} sub esi,[esp+00h] { 4 clk 13 ALU ops on Core 2 }
{@8F} add esi,[eax+ebp]
{C2D} lea eax,[eax+ebp]
{2} jo @over2flow { overflow: (-x)+(-y)=(+z) or (+x)+(+y)=(-z) }
{0} or esi,[edi+ebx*08h+__0COLON___ROWMARK]
{1} jz @zero
{K10}// lea eax,[eax+ebp]
{0} cmp esi,edx
{@9F} cmovb edx,esi
{@A2} cmovb cx,bx
@over2flow:
{0} mov esi,[edi+ebx*08h+(04h*08h)+__ROWMODIFIER]
{1*} add ebx,04h
{2*} jnz @ZERO2col
{ @zero: drops the pushed value, restores ECX with SAR 10h and updates __MINCOLROW; }
{ CMOVNC keeps the old __MINCOLROW when the bit shifted out by SAR is clear. }
{@AF} @zero:
{0} pop eax { add esp,04h } { forces ESP handling to AGU/memory pipe on Kaveri/Core }
{@B0-} mov eax,ecx
{2} sar ecx,10h
{0} cmovnc eax,[edi+__MINCOLROW]
{1} mov [edi+__MINCOLROW],eax
{2**} test ebx,ebx
{0**} jz @chk2col
{@C0*} add esi,[edi+ebx*08h+__0STAR] { zero found -> ESI=0 }
{2*} jz @4TH_STEP
{0} mov eax,[esp+__SIZE]
{1**} cmp word ptr [edi+__MINCOLROW],bx { STORE FORWARDED }
{2**} jz @re3start
{@D0} cmp esi,ecx
{1} mov [edi+ebx*08h+__0COLON___ROWMARK],ecx { set row mark }
{2} cmovl ecx,esi
{0*} sub ecx,04h { never clears ECX register }
{1*} jnz @mark3row { forced conditional jump for Sandy Bridge }
{ x2 } xor esi,esi
{@E0} { x4 } lea eax,[ebp+ebp+00h]
{ 4th step: writes ECX into __0STAR[EBX] and EBX into __0STARROW[ECX], then continues with }
{ the previous __0STARROW[ECX] value (EDX) and its __0COLON___ROWMARK entry until EDX is zero. }
@@4TH_STEP: { 5 AGU + 3 EX uops on Kaveri }
{@E4-} mov ebx,edx
@4TH_STEP:
{@E6} mov edx,[edi+ecx*08h+__0STARROW]
{2} mov [edi+ebx*08h+__0STAR],ecx
{0} mov [edi+ecx*08h+__0STARROW],ebx
{@F0} mov ecx,[edi+edx*08h+__0COLON___ROWMARK]
{2**} cmp edx,00h
{0**} jnz @@4TH_STEP
{ } sub esi,ebp
{ } sub edx,ebp
{ } lea ecx,[esi-04h] { mov ecx,[edi+__INITCOL] }
{ 1st step, per index: __COLMARK = NOT __0STARROW and __0COLON___ROWMARK = __FIXEDROW. }
{ EBX accumulates the AND of the __0STARROW entries; CMOVS uses the sign flag left by that }
{ AND (NOT and MOV do not change flags) to copy the current index into ECX. }
@@1ST_STEP: { K10:2.8 Core2:2.9 - 3.2 uop/clk - 1500*2+6100 }
{@00} mov eax,[edi+esi*08h+__0STARROW] { 4 AGU + 7 EX uops on Kaveri }
{1} and ebx,eax { 2 clk 6 ALU ops on Core 2 }
{2} not eax
{0} mov [edi+esi*08h+__COLMARK],eax
{1} mov eax,[edi+esi*08h+__FIXEDROW]
{2} cmovs ecx,esi
{0} mov [edi+esi*08h+__0COLON___ROWMARK],eax
{1*} add esi,04h
{2*} jnz @@1ST_STEP { clears ESI register }
{ } mov [esp+__SIZE],esi
{ -} xor ebx,ebx
{@21*} add ecx,04h { long jump instruction }
{ *} jnz @@2ND_STEP { ===>>> EBX: 00h EDX:negative ECX:initcol (>= EBP) }
{ } mov esi,[esp+ebp+04h+__MARKS]
{ -} mov ebx,edi { work matrix unmodified } { [esp+__SAVE] }
{ The code that follows @@results is outside this listing. }
@@results:

[ Szerkesztve ]

üzenetek