diff -urp lame-398-orig/libmp3lame/i386/choose_table.nas lame-398/libmp3lame/i386/choose_table.nas --- lame-398-orig/libmp3lame/i386/choose_table.nas 2008-07-16 21:47:19.000000000 +0200 +++ lame-398/libmp3lame/i386/choose_table.nas 2008-07-16 21:47:30.000000000 +0200 @@ -111,33 +111,42 @@ choose_table_H dw 0x1d16, 0x1e16, 0x1e17, 0x1f17, 0x1f17 choose_jump_table_L: - dd table_MMX.L_case_0 - dd table_MMX.L_case_1 - dd table_MMX.L_case_2 - dd table_MMX.L_case_3 - dd table_MMX.L_case_45 - dd table_MMX.L_case_45 - dd table_MMX.L_case_67 - dd table_MMX.L_case_67 - dd table_MMX.L_case_8_15 - dd table_MMX.L_case_8_15 - dd table_MMX.L_case_8_15 - dd table_MMX.L_case_8_15 - dd table_MMX.L_case_8_15 - dd table_MMX.L_case_8_15 - dd table_MMX.L_case_8_15 - dd table_MMX.L_case_8_15 + dd table_MMX.L_case_0 - choose_table_MMX + dd table_MMX.L_case_1 - choose_table_MMX + dd table_MMX.L_case_2 - choose_table_MMX + dd table_MMX.L_case_3 - choose_table_MMX + dd table_MMX.L_case_45 - choose_table_MMX + dd table_MMX.L_case_45 - choose_table_MMX + dd table_MMX.L_case_67 - choose_table_MMX + dd table_MMX.L_case_67 - choose_table_MMX + dd table_MMX.L_case_8_15 - choose_table_MMX + dd table_MMX.L_case_8_15 - choose_table_MMX + dd table_MMX.L_case_8_15 - choose_table_MMX + dd table_MMX.L_case_8_15 - choose_table_MMX + dd table_MMX.L_case_8_15 - choose_table_MMX + dd table_MMX.L_case_8_15 - choose_table_MMX + dd table_MMX.L_case_8_15 - choose_table_MMX + dd table_MMX.L_case_8_15 - choose_table_MMX segment_code ; ; use MMX ; +extern _GLOBAL_OFFSET_TABLE_ +get_pc.bp: + mov ebp, [esp] + retn + align 16 ; int choose_table(int *ix, int *end, int *s) choose_table_MMX: - mov ecx,[esp+4] ;ecx = begin - mov edx,[esp+8] ;edx = end + push ebp + call get_pc.bp + add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + + mov ecx,[esp+8] ;ecx = begin + mov edx,[esp+12] ;edx = end sub ecx,edx ;ecx = begin-end(should be minus) test ecx,8 pxor mm0,mm0 ;mm0=[0:0] @@ -169,13 +178,16 @@ choose_table_MMX: cmp eax,15 ja .with_ESC - jmp [choose_jump_table_L+eax*4] + lea ecx,[ebp + choose_table_MMX wrt ..gotoff] + add ecx,[ebp + choose_jump_table_L+eax*4 wrt ..gotoff] + jmp ecx .with_ESC1: emms - mov ecx, [esp+12] ; *s + mov ecx, [esp+16] ; *s mov [ecx], eax or eax,-1 + pop ebp ret .with_ESC: @@ -187,12 +199,12 @@ choose_table_MMX: push esi bsr eax, eax %assign _P 4*2 - movq mm5, [D15_15_15_15] - movq mm6, [D14_14_14_14] - movq mm3, [mul_add] + movq mm5, [ebp + D15_15_15_15 wrt ..gotoff] + movq mm6, [ebp + D14_14_14_14 wrt ..gotoff] + movq mm3, [ebp + mul_add wrt ..gotoff] - mov ecx, [esp+_P+4] ; = ix -; mov edx, [esp+_P+8] ; = end + mov ecx, [esp+_P+8] ; = ix +; mov edx, [esp+_P+12] ; = end sub ecx, edx xor esi, esi ; sum = 0 @@ -209,7 +221,7 @@ choose_table_MMX: psubw mm7, mm2 ; 14$B$h$jBg$-$$$H$-(B linbits_sum++; pmaddwd mm0, mm3 ; {0, 0, y, x}*{1, 16, 1, 16} movd ebx, mm0 - mov esi, [largetbl+ebx*4+(16*16+16)*4] + mov esi, [ebp + largetbl+ebx*4+(16*16+16)*4 wrt ..gotoff] jz .H_dual_exit @@ -224,9 +236,9 @@ choose_table_MMX: pmaddwd mm0, mm3 ; {y, x, y, x}*{1, 16, 1, 16} movd ebx, mm0 punpckhdq mm0,mm0 - add esi, [largetbl+ebx*4+(16*16+16)*4] + add esi, [ebp + largetbl+ebx*4+(16*16+16)*4 wrt ..gotoff] movd ebx, mm0 - add esi, [largetbl+ebx*4+(16*16+16)*4] + add esi, [ebp + largetbl+ebx*4+(16*16+16)*4 wrt ..gotoff] add ecx, 16 psubw mm7, mm2 ; 14$B$h$jBg$-$$$H$-(B linbits_sum++; jnz .H_dual_lp1 @@ -237,8 +249,8 @@ choose_table_MMX: paddd mm7,mm1 punpckldq mm7,mm7 - pmaddwd mm7, [linbits32+eax*8] ; linbits - mov ax, [choose_table_H+eax*2] + pmaddwd mm7, [ebp + linbits32+eax*8 wrt ..gotoff] ; linbits + mov ax, [ebp + choose_table_H+eax*2 wrt ..gotoff] movd ecx, mm7 punpckhdq mm7,mm7 @@ -261,54 +273,57 @@ choose_table_MMX: mov edx, ecx shr eax, 8 .chooseE_s1: - mov ecx, [esp+12] ; *s + mov ecx, [esp+16] ; *s and eax, 0xff add [ecx], edx + pop ebp ret table_MMX.L_case_0: emms + pop ebp ret table_MMX.L_case_1: emms - mov eax, [esp+12] ; *s - mov ecx, [esp+4] ; *ix + mov eax, [esp+16] ; *s + mov ecx, [esp+8] ; *ix sub ecx, edx push ebx .lp: mov ebx, [edx+ecx] add ebx, ebx add ebx, [edx+ecx+4] - movzx ebx, byte [ebx+t1l] + movzx ebx, byte [ebp + ebx+t1l wrt ..gotoff] add [eax], ebx add ecx, 8 jnz .lp pop ebx mov eax, 1 + pop ebp ret table_MMX.L_case_45: push dword 7 - mov ecx, tableABC+9*8 + lea ecx, [ebp + tableABC+9*8 wrt ..gotoff] jmp from3 table_MMX.L_case_67: push dword 10 - mov ecx, tableABC + lea ecx, [ebp + tableABC wrt ..gotoff] jmp from3 table_MMX.L_case_8_15: push dword 13 - mov ecx, tableDEF + lea ecx, [ebp + tableDEF wrt ..gotoff] from3: - mov eax,[esp+8] ;eax = *begin -; mov edx,[esp+12] ;edx = *end + mov eax,[esp+12] ;eax = *begin +; mov edx,[esp+16] ;edx = *end push ebx sub eax, edx - movq mm5,[mul_add] + movq mm5,[ebp + mul_add wrt ..gotoff] pxor mm2,mm2 ;mm2 = sum test eax, 8 @@ -361,22 +376,23 @@ from3: .choose3_s2: pop ecx add eax, ecx - mov ecx, [esp+12] ; *s + mov ecx, [esp+16] ; *s add [ecx], edx + pop ebp ret table_MMX.L_case_2: push dword 2 - mov ecx,table23 - pmov mm5,[mul_add23] + lea ecx,[ebp + table23 wrt ..gotoff] + pmov mm5,[ebp + mul_add23 wrt ..gotoff] jmp from2 table_MMX.L_case_3: push dword 5 - mov ecx,table56 - pmov mm5,[mul_add56] + lea ecx,[ebp + table56 wrt ..gotoff] + pmov mm5,[ebp + mul_add56 wrt ..gotoff] from2: - mov eax,[esp+8] ;eax = *begin -; mov edx,[esp+12] ;edx = *end + mov eax,[esp+12] ;eax = *begin +; mov edx,[esp+16] ;edx = *end push ebx push edi @@ -426,8 +442,9 @@ from2: mov edx, ecx inc eax .choose2_s1: - mov ecx, [esp+12] ; *s + mov ecx, [esp+16] ; *s add [ecx], edx + pop ebp ret end diff -urp lame-398-orig/libmp3lame/i386/fft3dn.nas lame-398/libmp3lame/i386/fft3dn.nas --- lame-398-orig/libmp3lame/i386/fft3dn.nas 2008-07-16 21:47:19.000000000 +0200 +++ lame-398/libmp3lame/i386/fft3dn.nas 2008-07-16 21:47:30.000000000 +0200 @@ -24,26 +24,35 @@ D_1_0_0_0 dd 0.0 , 1.0 segment_code +extern _GLOBAL_OFFSET_TABLE_ +get_pc.bp: + mov ebp, [esp] + retn + ;void fht_3DN(float *fz, int nn); proc fht_3DN pushd ebp, ebx, esi, edi - mov r0, [esp+20] ;fi - mov r1, [esp+24] ;r1 = nn - sub esp, 16 + sub esp, 20 + + call get_pc.bp + add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + mov r0, [esp+40] ;fi + mov r1, [esp+44] ;r1 = nn + lea r3, [ebp + costab wrt ..gotoff] ;tri = costab + lea r4, [r0+r1*8] ;r4 = fn = &fz[n] + mov [esp+16], r4 mov r4, 8 ;kx = k1/2 - mov r3, costab ;tri = costab - lea r6, [r0+r1*8] ;r6 = fn = &fz[n] pmov mm7, [r3] loopalign 16 .do1 lea r3, [r3+16] ;tri += 2; - pmov mm6, [costab+8] + pmov mm6, [ebp + costab+8 wrt ..gotoff] lea r2, [r4+r4*2] ;k3*fsize/2 mov r5, 4 ;i = 1*fsize @@ -104,7 +113,7 @@ proc fht_3DN pmovd [r1+r4*4], mm4 ;gi[k2] puphdq mm4, mm4 - cmp r0, r6 + cmp r0, [esp + 16] pmovd [r1+r4*2], mm0 ;gi[k1] pmovd [r1+r2*2], mm4 ;gi[k3] @@ -119,12 +128,12 @@ proc fht_3DN ; mm7 = 0x800000000 | 0 ; pmov mm1, mm6 - mov r0, [esp+36] ; fz + mov r0, [esp+40] ; fz puphdq mm1, mm1 ; c1 | c1 lea r1, [r0+r4*2] pfadd mm1, mm1 ; c1+c1 | c1+c1 pfmul mm1, mm6 ; 2*c1*c1 | 2*c1*s1 - pfsub mm1, [D_1_0_0_0] ; 2*c1*c1-1.0 | 2*c1*s1 = -c2 | s2 + pfsub mm1, [ebp + D_1_0_0_0 wrt ..gotoff] ; 2*c1*c1-1.0 | 2*c1*s1 = -c2 | s2 pmov mm0, mm1 pxor mm7, mm6 ; c1 | -s1 @@ -134,7 +143,7 @@ proc fht_3DN puphdq mm0, mm2 ; s2 | c2 puphdq mm6, mm3 ;-s1 | c1 - pxor mm0, [costab] ; c2 | -s2 + pxor mm0, [ebp + costab wrt ..gotoff] ; c2 | -s2 ; mm0 = s2| c2 ; mm1 = -c2| s2 @@ -233,7 +242,7 @@ proc fht_3DN lea r0, [r0+r4*8] lea r1, [r1+r4*8] - cmp r0, r6 + cmp r0, [esp + 16] pmov mm0, [esp] pmov mm1, [esp+8] @@ -249,17 +258,17 @@ proc fht_3DN pfsub mm6, mm7 ; c1*a-s1*b | s1*a+c1*b pupldq mm7,mm6 puphdq mm6,mm7 - pmov mm7, [costab] + pmov mm7, [ebp + costab wrt ..gotoff] jb near .for - mov r0, [esp+36] ;fi - cmp r4, [esp+36+4] + mov r0, [esp+40] ;fi + cmp r4, [esp+40+4] lea r4, [r4*4] ;kx *= 4 jb near .do1 .exitttt femms - add esp,16 + add esp,20 popd ebp, ebx, esi, edi endproc @@ -270,20 +279,24 @@ proc fht_E3DN pushd ebp, ebx, esi, edi - mov r0, [esp+20] ;fi - mov r1, [esp+24] ;r1 = nn - sub esp, 16 + sub esp, 20 + + call get_pc.bp + add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc + mov r0, [esp+40] ;fi + mov r1, [esp+44] ;r1 = nn + lea r3, [ebp + costab wrt ..gotoff] ;tri = costab + lea r4, [r0+r1*8] ;r4 = fn = &fz[n] + mov [esp+16], r4 mov r4, 8 ;kx = k1/2 - mov r3, costab ;tri = costab - lea r6, [r0+r1*8] ;r6 = fn = &fz[n] pmov mm7, [r3] loopalign 16 .do1 lea r3, [r3+16] ;tri += 2; - pmov mm6, [costab+8] + pmov mm6, [ebp + costab+8 wrt ..gotoff] lea r2, [r4+r4*2] ;k3*fsize/2 mov r5, 4 ;i = 1*fsize @@ -324,7 +337,7 @@ proc fht_E3DN pfadd mm3, mm4 ;f0+f2|f1+f3 pfsub mm5, mm4 ;f0-f2|f1-f3 - cmp r0, r6 + cmp r0, [esp + 16] pmovd [r1+r4*2], mm3 ;gi[k1] pmovd [r1+r2*2], mm5 ;gi[k3] puphdq mm3, mm3 @@ -343,12 +356,12 @@ proc fht_E3DN ; mm7 = 0x800000000 | 0 ; pmov mm5, mm6 - mov r0, [esp+36] ; fz + mov r0, [esp+40] ; fz puphdq mm5, mm5 ; c1 | c1 lea r1, [r0+r4*2] pfadd mm5, mm5 ; c1+c1 | c1+c1 pfmul mm5, mm6 ; 2*c1*c1 | 2*c1*s1 - pfsub mm5, [D_1_0_0_0] ; 2*c1*c1-1.0 | 2*c1*s1 = -c2 | s2 + pfsub mm5, [ebp + D_1_0_0_0 wrt ..gotoff] ; 2*c1*c1-1.0 | 2*c1*s1 = -c2 | s2 pswapd mm4, mm5 ; s2 |-c2 pxor mm4, mm7 ; s2 | c2 @@ -447,7 +460,7 @@ proc fht_E3DN lea r0, [r0+r4*8] lea r1, [r1+r4*8] - cmp r0, r6 + cmp r0, [esp + 16] pmov mm4, [esp] pmov mm5, [esp+8] @@ -462,16 +475,16 @@ proc fht_E3DN pfsub mm6, mm7 ; c1*a-s1*b | s1*a+c1*b pswapd mm6, mm6 ; ??? ; s1*a+c1*b | c1*a-s1*b - pmov mm7, [costab] + pmov mm7, [ebp + costab wrt ..gotoff] jb near .for - mov r0, [esp+36] ;fi - cmp r4, [esp+36+4] + mov r0, [esp+40] ;fi + cmp r4, [esp+40+4] lea r4, [r4*4] ;kx *= 4 jb near .do1 .exitttt femms - add esp,16 + add esp,20 popd ebp, ebx, esi, edi endproc diff -urp lame-398-orig/libmp3lame/i386/fftsse.nas lame-398/libmp3lame/i386/fftsse.nas --- lame-398-orig/libmp3lame/i386/fftsse.nas 2008-07-16 21:47:19.000000000 +0200 +++ lame-398/libmp3lame/i386/fftsse.nas 2008-07-16 21:48:10.000000000 +0200 @@ -25,6 +25,12 @@ costab_fft: S_SQRT2 dd 1.414213562 segment_code + +extern _GLOBAL_OFFSET_TABLE_ +get_pc.bp: + mov ebp, [esp] + retn + ;------------------------------------------------------------------------ ; by K. SAKAI ; 99/08/18 PIII 23k[clk] @@ -40,15 +46,20 @@ fht_SSE: push esi push edi push ebp -%assign _P 4*4 + +%assign _P 4*5 ;2つ目のループ - mov eax,[esp+_P+4] ;eax=fz - mov ebp,[esp+_P+8] ;=n + mov eax,[esp+_P+0] ;eax=fz + mov ebp,[esp+_P+4] ;=n shl ebp,3 add ebp,eax ; fn = fz + n, この関数終了まで不変 + push ebp + + call get_pc.bp + add ebp, _GLOBAL_OFFSET_TABLE_ + $$ - $ wrt ..gotpc - lea ecx,[costab_fft] + lea ecx,[ebp + costab_fft wrt ..gotoff] xor eax,eax mov al,8 ; =k1=1*(sizeof float) // 4, 16, 64, 256,... .lp2: ; do{ @@ -101,12 +112,12 @@ fht_SSE: ; gi[k3] = g1 - g3; fld dword [edi] fadd dword [edi+eax*2] - fld dword [S_SQRT2] + fld dword [ebp + S_SQRT2 wrt ..gotoff] fmul dword [edi+eax*4] fld dword [edi] fsub dword [edi+eax*2] - fld dword [S_SQRT2] + fld dword [ebp + S_SQRT2 wrt ..gotoff] fmul dword [edi+edx*2] fld st1 @@ -121,7 +132,7 @@ fht_SSE: fsubp st1,st0 fstp dword [edi+eax*4] - cmp ebx,ebp + cmp ebx,[esp] jl near .lp20 ; while (fi 必要 + xorps xmm7,[ebp + Q_MMPP wrt ..gotoff] ; = {-s1, -c1, +c1, +s1} -> 必要 addss xmm5,xmm5 ; = (--, --, --, 2*s1) add esi,4 ; esi = fi = fz + i shufps xmm5,xmm5,R4(0,0,0,0) ; = (2*s1, 2*s1, 2*s1, 2*s1) mulps xmm5,xmm6 ; = (2*s1*c1, 2*s1*s1, 2*s1*s1, 2*s1*c1) - subps xmm5,[D_1100] ; = (--, 2*s1*s1-1, --, 2*s1*c1) = {-- -c2 -- s2} + subps xmm5,[ebp + D_1100 wrt ..gotoff] ; = (--, 2*s1*s1-1, --, 2*s1*c1) = {-- -c2 -- s2} movaps xmm4,xmm5 shufps xmm5,xmm5,R4(2,0,2,0) ; = {-c2, s2, -c2, s2} -> 必要 - xorps xmm4,[Q_MMPP] ; = {--, c2, --, s2} + xorps xmm4,[ebp + Q_MMPP wrt ..gotoff] ; = {--, c2, --, s2} shufps xmm4,xmm4,R4(0,2,0,2) ; = {s2, c2, s2, c2} -> 必要 loopalign 16 @@ -222,7 +233,7 @@ fht_SSE: movss [edi+eax*4],xmm2 movss [esi+edx*2],xmm0 lea esi,[esi + eax*8] ; fi += (k1 * 4); - cmp esi,ebp + cmp esi,[esp] jl near .lp21 ; while (fi