Skip to content

Commit df0aac6

Browse files
committed
gcc's version of inline assembly for the maths helpers
1 parent 4d7493e commit df0aac6

File tree

1 file changed

+192
-6
lines changed

1 file changed

+192
-6
lines changed

WaveSabreCore/src/Helpers.cpp

Lines changed: 192 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,17 @@
44
#define _USE_MATH_DEFINES
55
#include <math.h>
66

7+
// TODO: make assembly equivalent for Windows x64 (use intrinsic ?)
8+
// ^--- you probably only need to change esp to rsp? -poro
9+
10+
// ASM_MATH_AVAILABLE selects between the fpu* helpers defined in this file
// and the plain <math.h> functions. It is 1 when building with 32-bit x86
// MSVC or with any compiler that defines __GNUC__, and 0 otherwise.
#if (defined(_MSC_VER) && defined(_M_IX86)) || defined(__GNUC__)
11+
#define ASM_MATH_AVAILABLE (1)
12+
#else /* neither 32-bit x86 MSVC nor a __GNUC__ compiler (this includes MSVC x64) */
13+
#define ASM_MATH_AVAILABLE (0)
14+
#endif
15+
16+
#if ASM_MATH_AVAILABLE == 1
717
#if defined(_MSC_VER) && defined(_M_IX86)
8-
// TODO: make assembly equivalent for x64 (use intrinsic ?)
918
static __declspec(naked) double __vectorcall fpuPow(double x, double y)
1019
{
1120
__asm
@@ -53,7 +62,87 @@ static __declspec(naked) double __vectorcall fpuPow(double x, double y)
5362
ret
5463
}
5564
}
65+
#elif defined(__GNUC__)
66+
#if defined(__x86_64__) || defined(__i386__)
/*
 * x87 implementation of pow() for GCC/clang on x86 and x86-64.
 * Returns x raised to the power y, evaluated as 2^(y * log2(x)).
 *
 * The function is naked: the assembly below is the whole function, including
 * argument fetch and the final `ret`.
 *   - i386:   x and y are read from the stack, the result is left in st(0).
 *   - x86_64: x arrives in xmm0 and y in xmm1, the result is returned in xmm0.
 *
 * Special cases implemented by the code:
 *   - y compares equal to 0 -> returns 1
 *   - x compares equal to 0 -> returns 0 (tested after the y check)
 * comisd also reports "equal" for unordered operands, so a NaN exponent
 * returns 1 and, for a non-zero exponent, a NaN base returns 0.
 * Negative bases get no special treatment: x is fed to fyl2x as-is, and
 * fyl2x is only defined for non-negative operands.
 *
 * Only a basic asm statement is used (no operand or clobber lists, single
 * '%' register prefixes): GCC documents basic asm as the only form that is
 * supported inside naked functions.
 */
__attribute__((__naked__,__noinline__)) static double fpuPow(double x, double y)
{
	asm volatile(
		// Prologue: x in xmm0, y in xmm1, plus 8 bytes of stack scratch space
		// used to move values between the SSE registers and the x87 stack.
#ifdef __x86_64__
		"subq $8, %rsp\n"
#else
		"movsd 4(%esp), %xmm0\n"
		"movsd 12(%esp), %xmm1\n"
		"subl $8, %esp\n"
#endif
		// y == 0 -> result is 1
		"xorpd %xmm2, %xmm2\n"
		"comisd %xmm2, %xmm1\n"
		"jne 1f\n"

		"fld1\n"
		"jmp 3f\n"

		// x == 0 -> result is 0
		"1:\n"
		"comisd %xmm2, %xmm0\n"
		"jne 2f\n"

		"fldz\n"
		"jmp 3f\n"

		// General case: push y, then x, so that st(0) = x and st(1) = y
		"2:\n"
#ifdef __x86_64__
		"movsd %xmm1, (%rsp)\n"
		"fldl (%rsp)\n"
		"movsd %xmm0, (%rsp)\n"
		"fldl (%rsp)\n"
#else
		"movsd %xmm1, (%esp)\n"
		"fldl (%esp)\n"
		"movsd %xmm0, (%esp)\n"
		"fldl (%esp)\n"
#endif

		"fyl2x\n"                 // st0 = t = y * log2(x)
		"fld %st(0)\n"            // duplicate t
		"frndint\n"               // st0 = round(t), st1 = t
		"fsub %st(0), %st(1)\n"   // st1 = round(t) - t (AT&T reversed-operand form)
		"fxch %st(1)\n"           // st0 = round(t) - t, st1 = round(t)
		"fchs\n"                  // st0 = f = t - round(t)
		"f2xm1\n"                 // st0 = 2^f - 1
		"fld1\n"
		"faddp %st(0), %st(1)\n"  // st0 = 2^f
		"fscale\n"                // st0 = 2^f * 2^round(t)
		"fstp %st(1)\n"           // drop round(t), keep the result in st0

		// Epilogue: hand the result back according to the ABI
		"3:\n"
#ifdef __x86_64__
		"fstpl (%rsp)\n"
		"movsd (%rsp), %xmm0\n"
		"addq $8, %rsp\n"
#else
		"addl $8, %esp\n"
#endif
		"ret\n"
	);
}
#else
// Non-x86 GCC/clang targets: no hand-written assembly, defer to the compiler's
// pow builtin. __builtin_pow accepts arbitrary exponents (the integer-only
// builtin is __builtin_powi), so no constant/integer dispatch is needed.
#define fpuPow(x, y) __builtin_pow((x), (y))
#endif
141+
#else
142+
#error "Unsupported compiler."
143+
#endif /* compiler */
56144

145+
#if defined(_MSC_VER) && defined(_M_IX86)
57146
static __declspec(naked) float __vectorcall fpuPowF(float x, float y)
58147
{
59148
__asm
@@ -101,7 +190,87 @@ static __declspec(naked) float __vectorcall fpuPowF(float x, float y)
101190
ret
102191
}
103192
}
193+
#elif defined(__GNUC__)
194+
#if defined(__x86_64__) || defined(__i386__)
/*
 * x87 implementation of powf() for GCC/clang on x86 and x86-64.
 * Returns x raised to the power y, evaluated as 2^(y * log2(x)).
 *
 * The function is naked: the assembly below is the whole function, including
 * argument fetch and the final `ret`.
 *   - i386:   x and y are read from the stack, the result is left in st(0).
 *   - x86_64: x arrives in xmm0 and y in xmm1, the result is returned in xmm0.
 *
 * Special cases implemented by the code:
 *   - y compares equal to 0 -> returns 1
 *   - x compares equal to 0 -> returns 0 (tested after the y check)
 * comiss also reports "equal" for unordered operands, so a NaN exponent
 * returns 1 and, for a non-zero exponent, a NaN base returns 0.
 * Negative bases get no special treatment: x is fed to fyl2x as-is, and
 * fyl2x is only defined for non-negative operands.
 *
 * Only a basic asm statement is used (no operand or clobber lists, single
 * '%' register prefixes): GCC documents basic asm as the only form that is
 * supported inside naked functions.
 */
__attribute__((__naked__,__noinline__)) static float fpuPowF(float x, float y)
{
	asm volatile(
		// Prologue: x in xmm0, y in xmm1, plus 8 bytes of stack scratch space
		// used to move values between the SSE registers and the x87 stack.
#ifdef __x86_64__
		"subq $8, %rsp\n"
#else
		"movss 4(%esp), %xmm0\n"
		"movss 8(%esp), %xmm1\n"
		"subl $8, %esp\n"
#endif
		// y == 0 -> result is 1
		"xorps %xmm2, %xmm2\n"
		"comiss %xmm2, %xmm1\n"
		"jne 1f\n"

		"fld1\n"
		"jmp 3f\n"

		// x == 0 -> result is 0
		"1:\n"
		"comiss %xmm2, %xmm0\n"
		"jne 2f\n"

		"fldz\n"
		"jmp 3f\n"

		// General case: push y, then x, so that st(0) = x and st(1) = y
		"2:\n"
#ifdef __x86_64__
		"movss %xmm1, (%rsp)\n"
		"flds (%rsp)\n"
		"movss %xmm0, (%rsp)\n"
		"flds (%rsp)\n"
#else
		"movss %xmm1, (%esp)\n"
		"flds (%esp)\n"
		"movss %xmm0, (%esp)\n"
		"flds (%esp)\n"
#endif

		"fyl2x\n"                 // st0 = t = y * log2(x)
		"fld %st(0)\n"            // duplicate t
		"frndint\n"               // st0 = round(t), st1 = t
		"fsub %st(0), %st(1)\n"   // st1 = round(t) - t (AT&T reversed-operand form)
		"fxch %st(1)\n"           // st0 = round(t) - t, st1 = round(t)
		"fchs\n"                  // st0 = f = t - round(t)
		"f2xm1\n"                 // st0 = 2^f - 1
		"fld1\n"
		"faddp %st(0), %st(1)\n"  // st0 = 2^f
		"fscale\n"                // st0 = 2^f * 2^round(t)
		"fstp %st(1)\n"           // drop round(t), keep the result in st0

		// Epilogue: hand the result back according to the ABI
		"3:\n"
#ifdef __x86_64__
		"fstps (%rsp)\n"
		"movss (%rsp), %xmm0\n"
		"addq $8, %rsp\n"
#else
		"addl $8, %esp\n"
#endif
		"ret\n"
	);
}
#else
// Non-x86 GCC/clang targets: no hand-written assembly, defer to the compiler's
// powf builtin. __builtin_powf accepts arbitrary exponents (the integer-only
// builtin is __builtin_powif), so no constant/integer dispatch is needed.
#define fpuPowF(x, y) __builtin_powf((x), (y))
#endif
269+
#else
270+
#error "Unsupported compiler."
271+
#endif /* compiler */
104272

273+
#if defined(_MSC_VER) && defined(_M_IX86)
105274
static __declspec(naked) double __vectorcall fpuCos(double x)
106275
{
107276
__asm
@@ -119,7 +288,24 @@ static __declspec(naked) double __vectorcall fpuCos(double x)
119288
ret
120289
}
121290
}
122-
#endif // defined(_MSC_VER) && defined(_M_IX86)
291+
#elif defined(__GNUC__)
292+
#if defined(__x86_64__) || defined(__i386__)
/*
 * Cosine of x (radians) computed with the x87 `fcos` instruction.
 *
 * Only the single instruction is written in assembly; argument passing and
 * the return are left to the compiler. Not writing the *entire* function
 * body in assembly helps gcc and clang with inlining and LTO (the same
 * approach did not work out for fpuPow/fpuPowF, which are pure assembly).
 *
 * The "+t" constraint places x on top of the x87 stack as both input and
 * output. The statement is not marked volatile: its result depends only on
 * its operand, so the compiler is free to merge or drop redundant uses.
 */
__attribute__((__always_inline__)) inline static double fpuCos(double x)
{
	asm("fcos" : "+t"(x));
	return x;
}
#else /* GCC/clang on a non-x86 target */
// No x87 unit to use here: defer to the compiler's cosine builtin.
#define fpuCos(x) __builtin_cos(x)
#endif /* x86 vs. other architectures */
305+
#else
306+
#error "Unsupported compiler."
307+
#endif /* compiler */
308+
#endif // ASM_MATH_AVAILABLE == 1
123309

124310
namespace WaveSabreCore
125311
{
@@ -138,7 +324,7 @@ namespace WaveSabreCore
138324
for (int i = 0; i < fastCosTabSize + 1; i++)
139325
{
140326
double phase = double(i) * ((M_PI * 2) / fastCosTabSize);
141-
#if defined(_MSC_VER) && defined(_M_IX86)
327+
#if ASM_MATH_AVAILABLE == 1
142328
fastCosTab[i] = fpuCos(phase);
143329
#else
144330
fastCosTab[i] = cos(phase);
@@ -153,7 +339,7 @@ namespace WaveSabreCore
153339

154340
double Helpers::Pow(double x, double y)
155341
{
156-
#if defined(_MSC_VER) && defined(_M_IX86)
342+
#if ASM_MATH_AVAILABLE == 1
157343
return fpuPow(x, y);
158344
#else
159345
return pow(x, y);
@@ -162,7 +348,7 @@ namespace WaveSabreCore
162348

163349
float Helpers::PowF(float x, float y)
164350
{
165-
#if defined(_MSC_VER) && defined(_M_IX86)
351+
#if ASM_MATH_AVAILABLE == 1
166352
return fpuPowF(x, y);
167353
#else
168354
return powf(x, y);
@@ -365,7 +551,7 @@ namespace WaveSabreCore
365551
{
366552
return (Spread)(int)(param * 2.0f);
367553
}
368-
554+
369555
float Helpers::SpreadToParam(Spread spread)
370556
{
371557
return (float)spread / 2.0f;

0 commit comments

Comments
 (0)