author     kvn <none@none>  2012-05-24 18:39:44 -0700
committer  kvn <none@none>  2012-05-24 18:39:44 -0700
commit     9e2e71a3d894ba24eace18527c4ded5dce866901 (patch)
tree       688467b9a07342fcbd728b08f1bcbc1e64158cb6 /src/cpu
parent     57fc1ff2ecfdcd7ea98f4ad0f1e8677db656b2b6 (diff)
parent     06d52788178edc5ccd37a2bff163e803f00705a0 (diff)
Merge
Diffstat (limited to 'src/cpu')
-rw-r--r--  src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp      14
-rw-r--r--  src/cpu/sparc/vm/interpreter_sparc.cpp           2
-rw-r--r--  src/cpu/x86/vm/assembler_x86.cpp               442
-rw-r--r--  src/cpu/x86/vm/assembler_x86.hpp                24
-rw-r--r--  src/cpu/x86/vm/c1_LIRAssembler_x86.cpp           6
-rw-r--r--  src/cpu/x86/vm/c1_LIRGenerator_x86.cpp          29
-rw-r--r--  src/cpu/x86/vm/c1_LinearScan_x86.cpp            75
-rw-r--r--  src/cpu/x86/vm/interpreter_x86_32.cpp           13
-rw-r--r--  src/cpu/x86/vm/interpreter_x86_64.cpp            8
-rw-r--r--  src/cpu/x86/vm/stubGenerator_x86_32.cpp         20
-rw-r--r--  src/cpu/x86/vm/stubGenerator_x86_64.cpp         31
-rw-r--r--  src/cpu/x86/vm/templateInterpreter_x86_32.cpp    8
-rw-r--r--  src/cpu/x86/vm/templateInterpreter_x86_64.cpp    8
-rw-r--r--  src/cpu/x86/vm/x86_32.ad                       221
-rw-r--r--  src/cpu/x86/vm/x86_64.ad                        32
15 files changed, 677 insertions(+), 256 deletions(-)
diff --git a/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp b/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp
index abf3ab9d5..e432fedd6 100644
--- a/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp
+++ b/src/cpu/sparc/vm/c1_LIRGenerator_sparc.cpp
@@ -738,7 +738,8 @@ void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
case vmIntrinsics::_dlog: // fall through
case vmIntrinsics::_dsin: // fall through
case vmIntrinsics::_dtan: // fall through
- case vmIntrinsics::_dcos: {
+ case vmIntrinsics::_dcos: // fall through
+ case vmIntrinsics::_dexp: {
assert(x->number_of_arguments() == 1, "wrong type");
address runtime_entry = NULL;
@@ -758,12 +759,23 @@ void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
case vmIntrinsics::_dlog10:
runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dlog10);
break;
+ case vmIntrinsics::_dexp:
+ runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dexp);
+ break;
default:
ShouldNotReachHere();
}
LIR_Opr result = call_runtime(x->argument_at(0), runtime_entry, x->type(), NULL);
set_result(x, result);
+ break;
+ }
+ case vmIntrinsics::_dpow: {
+ assert(x->number_of_arguments() == 2, "wrong type");
+ address runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dpow);
+ LIR_Opr result = call_runtime(x->argument_at(0), x->argument_at(1), runtime_entry, x->type(), NULL);
+ set_result(x, result);
+ break;
}
}
}
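On SPARC both new intrinsics are lowered to plain runtime calls. As a minimal sketch of what those entries amount to (the wrapper shape below is an assumption modeled on the existing dsin/dcos entries; the actual JRT_LEAF definitions live in sharedRuntime.cpp and are not part of this diff):

    #include <cmath>

    // Hypothetical stand-ins for SharedRuntime::dexp and SharedRuntime::dpow:
    // thin leaf wrappers over libm that C1's call_runtime can target.
    extern "C" double SharedRuntime_dexp(double x)           { return std::exp(x); }
    extern "C" double SharedRuntime_dpow(double x, double y) { return std::pow(x, y); }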
diff --git a/src/cpu/sparc/vm/interpreter_sparc.cpp b/src/cpu/sparc/vm/interpreter_sparc.cpp
index 5471ebca2..7e0623376 100644
--- a/src/cpu/sparc/vm/interpreter_sparc.cpp
+++ b/src/cpu/sparc/vm/interpreter_sparc.cpp
@@ -403,6 +403,8 @@ address AbstractInterpreterGenerator::generate_method_entry(AbstractInterpreter:
case Interpreter::java_lang_math_abs : break;
case Interpreter::java_lang_math_log : break;
case Interpreter::java_lang_math_log10 : break;
+ case Interpreter::java_lang_math_pow : break;
+ case Interpreter::java_lang_math_exp : break;
case Interpreter::java_lang_ref_reference_get
: entry_point = ((InterpreterGenerator*)this)->generate_Reference_get_entry(); break;
default : ShouldNotReachHere(); break;
diff --git a/src/cpu/x86/vm/assembler_x86.cpp b/src/cpu/x86/vm/assembler_x86.cpp
index 7c7cb3d8a..4537a24e0 100644
--- a/src/cpu/x86/vm/assembler_x86.cpp
+++ b/src/cpu/x86/vm/assembler_x86.cpp
@@ -3578,6 +3578,21 @@ void Assembler::fyl2x() {
emit_byte(0xF1);
}
+void Assembler::frndint() {
+ emit_byte(0xD9);
+ emit_byte(0xFC);
+}
+
+void Assembler::f2xm1() {
+ emit_byte(0xD9);
+ emit_byte(0xF0);
+}
+
+void Assembler::fldl2e() {
+ emit_byte(0xD9);
+ emit_byte(0xEA);
+}
+
// SSE SIMD prefix byte values corresponding to VexSimdPrefix encoding.
static int simd_pre[4] = { 0, 0x66, 0xF3, 0xF2 };
// SSE opcode second byte values (first is 0x0F) corresponding to VexOpcode encoding.
@@ -6868,6 +6883,243 @@ void MacroAssembler::fldcw(AddressLiteral src) {
Assembler::fldcw(as_Address(src));
}
+void MacroAssembler::pow_exp_core_encoding() {
+ // kills rax, rcx, rdx
+ subptr(rsp,sizeof(jdouble));
+ // computes 2^X. Stack: X ...
+ // f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and
+ // keep it on the thread's stack to compute 2^int(X) later
+ // then compute 2^(X-int(X)) as (2^(X-int(X))-1)+1
+ // final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X))
+ fld_s(0); // Stack: X X ...
+ frndint(); // Stack: int(X) X ...
+ fsuba(1); // Stack: int(X) X-int(X) ...
+ fistp_s(Address(rsp,0)); // move int(X) as integer to thread's stack. Stack: X-int(X) ...
+ f2xm1(); // Stack: 2^(X-int(X))-1 ...
+ fld1(); // Stack: 1 2^(X-int(X))-1 ...
+ faddp(1); // Stack: 2^(X-int(X))
+ // computes 2^(int(X)): add exponent bias (1023) to int(X), then
+ // shift int(X)+1023 to exponent position.
+ // The exponent is limited to 11 bits: if int(X)+1023 does not fit
+ // in 11 bits, set the result to NaN. 0x000 and 0x7FF are reserved
+ // exponent values, so detect them and set the result to NaN.
+ movl(rax,Address(rsp,0));
+ movl(rcx, -2048); // 11 bit mask and valid NaN binary encoding
+ addl(rax, 1023);
+ movl(rdx,rax);
+ shll(rax,20);
+ // Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN.
+ addl(rdx,1);
+ // Check that 1 < int(X)+1023+1 < 2048
+ // in 3 steps:
+ // 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048
+ // 2- (int(X)+1023+1) != 0
+ // 3- (int(X)+1023+1) != 1
+ // Do 2- first because addl just updated the flags.
+ cmov32(Assembler::equal,rax,rcx);
+ cmpl(rdx,1);
+ cmov32(Assembler::equal,rax,rcx);
+ testl(rdx,rcx);
+ cmov32(Assembler::notEqual,rax,rcx);
+ movl(Address(rsp,4),rax);
+ movl(Address(rsp,0),0);
+ fmul_d(Address(rsp,0)); // Stack: 2^X ...
+ addptr(rsp,sizeof(jdouble));
+}
+
+void MacroAssembler::fast_pow() {
+ // computes X^Y = 2^(Y * log2(X))
+ // if fast computation is not possible, the result is NaN; callers
+ // of this macro must provide a fallback.
+ fyl2x(); // Stack: (Y*log2(X)) ...
+ pow_exp_core_encoding(); // Stack: X^Y ...
+}
+
+void MacroAssembler::fast_exp() {
+ // computes exp(X) = 2^(X * log2(e))
+ // if fast computation is not possible, the result is NaN; callers
+ // of this macro must provide a fallback.
+ fldl2e(); // Stack: log2(e) X ...
+ fmulp(1); // Stack: (X*log2(e)) ...
+ pow_exp_core_encoding(); // Stack: exp(X) ...
+}
+
+void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) {
+ // kills rax, rcx, rdx
+ // pow and exp need two extra slots on the FPU stack.
+ Label slow_case, done;
+ Register tmp = noreg;
+ if (!VM_Version::supports_cmov()) {
+ // fcmp needs a temporary register, so preserve rdx.
+ tmp = rdx;
+ }
+ Register tmp2 = rax;
+ Register tmp3 = rcx;
+
+ if (is_exp) {
+ // Stack: X
+ fld_s(0); // duplicate argument for runtime call. Stack: X X
+ fast_exp(); // Stack: exp(X) X
+ fcmp(tmp, 0, false, false); // Stack: exp(X) X
+ // exp(X) not equal to itself: exp(X) is NaN, go to the slow case.
+ jcc(Assembler::parity, slow_case);
+ // get rid of duplicate argument. Stack: exp(X)
+ if (num_fpu_regs_in_use > 0) {
+ fxch();
+ fpop();
+ } else {
+ ffree(1);
+ }
+ jmp(done);
+ } else {
+ // Stack: X Y
+ Label x_negative, y_odd;
+
+ fldz(); // Stack: 0 X Y
+ fcmp(tmp, 1, true, false); // Stack: X Y
+ jcc(Assembler::above, x_negative);
+
+ // X >= 0
+
+ fld_s(1); // duplicate arguments for runtime call. Stack: Y X Y
+ fld_s(1); // Stack: X Y X Y
+ fast_pow(); // Stack: X^Y X Y
+ fcmp(tmp, 0, false, false); // Stack: X^Y X Y
+ // X^Y not equal to itself: X^Y is NaN, go to the slow case.
+ jcc(Assembler::parity, slow_case);
+ // get rid of duplicate arguments. Stack: X^Y
+ if (num_fpu_regs_in_use > 0) {
+ fxch(); fpop();
+ fxch(); fpop();
+ } else {
+ ffree(2);
+ ffree(1);
+ }
+ jmp(done);
+
+ // X <= 0
+ bind(x_negative);
+
+ fld_s(1); // Stack: Y X Y
+ frndint(); // Stack: int(Y) X Y
+ fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
+ jcc(Assembler::notEqual, slow_case);
+
+ subptr(rsp, 8);
+
+ // For X^Y, when X < 0, Y has to be an integer and the final
+ // result depends on whether it's odd or even. We just checked
+ // that int(Y) == Y. We move int(Y) to gp registers as a 64 bit
+ // integer to test its parity. If int(Y) is huge and doesn't fit
+ // in the 64 bit integer range, the integer indefinite value will
+ // end up in the gp registers. Huge numbers are all even, and the
+ // integer indefinite value is also even, so the parity test still works.
+
+#ifdef ASSERT
+ // Let's check we don't end up with an integer indefinite number
+ // when not expected. First test for huge numbers: check whether
+ // int(Y)+1 == int(Y) which is true for very large numbers and
+ // those are all even. A 64 bit integer is guaranteed to not
+ // overflow for numbers where y+1 != y (when precision is set to
+ // double precision).
+ Label y_not_huge;
+
+ fld1(); // Stack: 1 int(Y) X Y
+ fadd(1); // Stack: 1+int(Y) int(Y) X Y
+
+#ifdef _LP64
+ // trip to memory to force the precision down from double extended
+ // precision
+ fstp_d(Address(rsp, 0));
+ fld_d(Address(rsp, 0));
+#endif
+
+ fcmp(tmp, 1, true, false); // Stack: int(Y) X Y
+#endif
+
+ // move int(Y) as 64 bit integer to thread's stack
+ fistp_d(Address(rsp,0)); // Stack: X Y
+
+#ifdef ASSERT
+ jcc(Assembler::notEqual, y_not_huge);
+
+ // Y is huge so we know it's even. It may not fit in a 64 bit
+ // integer and we don't want the debug code below to see the
+ // integer indefinite value so overwrite int(Y) on the thread's
+ // stack with 0.
+ movl(Address(rsp, 0), 0);
+ movl(Address(rsp, 4), 0);
+
+ bind(y_not_huge);
+#endif
+
+ fld_s(1); // duplicate arguments for runtime call. Stack: Y X Y
+ fld_s(1); // Stack: X Y X Y
+ fabs(); // Stack: abs(X) Y X Y
+ fast_pow(); // Stack: abs(X)^Y X Y
+ fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
+ // abs(X)^Y not equal to itself: abs(X)^Y is NaN, go to the slow case.
+
+ pop(tmp2);
+ NOT_LP64(pop(tmp3));
+ jcc(Assembler::parity, slow_case);
+
+#ifdef ASSERT
+ // Check that int(Y) is not integer indefinite value (int
+ // overflow). Shouldn't happen because for values that would
+ // overflow, 1+int(Y) == int(Y), which was tested earlier.
+#ifndef _LP64
+ {
+ Label integer;
+ testl(tmp2, tmp2);
+ jcc(Assembler::notZero, integer);
+ cmpl(tmp3, 0x80000000);
+ jcc(Assembler::notZero, integer);
+ stop("integer indefinite value shouldn't be seen here");
+ bind(integer);
+ }
+#else
+ {
+ Label integer;
+ mov(tmp3, tmp2); // preserve tmp2 for parity check below
+ shlq(tmp3, 1);
+ jcc(Assembler::carryClear, integer);
+ jcc(Assembler::notZero, integer);
+ stop("integer indefinite value shouldn't be seen here");
+ bind(integer);
+ }
+#endif
+#endif
+
+ // get rid of duplicate arguments. Stack: X^Y
+ if (num_fpu_regs_in_use > 0) {
+ fxch(); fpop();
+ fxch(); fpop();
+ } else {
+ ffree(2);
+ ffree(1);
+ }
+
+ testl(tmp2, 1);
+ jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
+ // X <= 0, Y odd: X^Y = -abs(X)^Y
+
+ fchs(); // Stack: -abs(X)^Y ...
+ jmp(done);
+ }
+
+ // slow case: runtime call
+ bind(slow_case);
+
+ fpop(); // pop incorrect result or int(Y)
+
+ fp_runtime_fallback(is_exp ? CAST_FROM_FN_PTR(address, SharedRuntime::dexp) : CAST_FROM_FN_PTR(address, SharedRuntime::dpow),
+ is_exp ? 1 : 2, num_fpu_regs_in_use);
+
+ // Come here with result in F-TOS
+ bind(done);
+}
+
void MacroAssembler::fpop() {
ffree();
fincstp();
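Restating the arithmetic of pow_exp_core_encoding as C++ may help; this is a hedged sketch with illustrative names, assuming IEEE-754 doubles, which the assembly above also relies on:

    #include <cmath>
    #include <cstdint>
    #include <cstring>

    // Compute 2^x as 2^int(x) * 2^(x-int(x)), building 2^int(x) directly in
    // the exponent field of a double -- the same plan as the assembly above.
    static double two_to_the_x(double x) {
      if (x != x) return x;                // NaN in, NaN out
      double ix = std::nearbyint(x);       // frndint: round to nearest integer
      double frac = x - ix;                // in [-0.5, 0.5], legal for f2xm1
      double pow2_frac = std::exp2(frac);  // f2xm1; fld1; faddp => 2^frac

      // Clamp so the int conversion below stays defined; anything this large
      // overflows the 11-bit exponent and yields NaN anyway.
      if (ix > 2047.0)  ix = 2047.0;
      if (ix < -2047.0) ix = -2047.0;
      int32_t biased = (int32_t)ix + 1023; // double exponent bias

      uint64_t bits;
      if (biased <= 0 || biased >= 0x7FF) {
        // reserved/overflowed exponent: produce NaN, matching the cmov
        // sequence that stuffs 0xFFFFF800 into the high word
        bits = UINT64_C(0xFFFFF80000000000);
      } else {
        bits = (uint64_t)(uint32_t)biased << 52; // exponent field of 2^int(x)
      }
      double pow2_int;
      std::memcpy(&pow2_int, &bits, sizeof(bits));
      return pow2_int * pow2_frac;         // fmul_d: 2^int(x) * 2^frac
    }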
@@ -8045,6 +8297,144 @@ void MacroAssembler::incr_allocated_bytes(Register thread,
#endif
}
+void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) {
+ pusha();
+
+ // if we are coming from c1, xmm registers may be live
+ if (UseSSE >= 1) {
+ subptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
+ }
+ int off = 0;
+ if (UseSSE == 1) {
+ movflt(Address(rsp,off++*sizeof(jdouble)),xmm0);
+ movflt(Address(rsp,off++*sizeof(jdouble)),xmm1);
+ movflt(Address(rsp,off++*sizeof(jdouble)),xmm2);
+ movflt(Address(rsp,off++*sizeof(jdouble)),xmm3);
+ movflt(Address(rsp,off++*sizeof(jdouble)),xmm4);
+ movflt(Address(rsp,off++*sizeof(jdouble)),xmm5);
+ movflt(Address(rsp,off++*sizeof(jdouble)),xmm6);
+ movflt(Address(rsp,off++*sizeof(jdouble)),xmm7);
+ } else if (UseSSE >= 2) {
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm0);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm1);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm2);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm3);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm4);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm5);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm6);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm7);
+#ifdef _LP64
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm8);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm9);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm10);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm11);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm12);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm13);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm14);
+ movdbl(Address(rsp,off++*sizeof(jdouble)),xmm15);
+#endif
+ }
+
+ // Preserve registers across runtime call
+ int incoming_argument_and_return_value_offset = -1;
+ if (num_fpu_regs_in_use > 1) {
+ // Must preserve all other FPU regs (could alternatively convert
+ // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash
+ // FPU state, but we cannot trust the C compiler)
+ NEEDS_CLEANUP;
+ // NOTE that in this case we also push the incoming argument(s) to
+ // the stack and restore it later; we also use this stack slot to
+ // hold the return value from dsin, dcos etc.
+ for (int i = 0; i < num_fpu_regs_in_use; i++) {
+ subptr(rsp, sizeof(jdouble));
+ fstp_d(Address(rsp, 0));
+ }
+ incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
+ for (int i = nb_args-1; i >= 0; i--) {
+ fld_d(Address(rsp, incoming_argument_and_return_value_offset-i*sizeof(jdouble)));
+ }
+ }
+
+ subptr(rsp, nb_args*sizeof(jdouble));
+ for (int i = 0; i < nb_args; i++) {
+ fstp_d(Address(rsp, i*sizeof(jdouble)));
+ }
+
+#ifdef _LP64
+ if (nb_args > 0) {
+ movdbl(xmm0, Address(rsp, 0));
+ }
+ if (nb_args > 1) {
+ movdbl(xmm1, Address(rsp, sizeof(jdouble)));
+ }
+ assert(nb_args <= 2, "unsupported number of args");
+#endif // _LP64
+
+ // NOTE: we must not use call_VM_leaf here because that requires a
+ // complete interpreter frame in debug mode -- same bug as 4387334
+ // MacroAssembler::call_VM_leaf_base is perfectly safe and will
+ // do proper 64bit abi
+
+ NEEDS_CLEANUP;
+ // Need to add stack banging before this runtime call if it needs to
+ // be taken; however, there is no generic stack banging routine at
+ // the MacroAssembler level
+
+ MacroAssembler::call_VM_leaf_base(runtime_entry, 0);
+
+#ifdef _LP64
+ movsd(Address(rsp, 0), xmm0);
+ fld_d(Address(rsp, 0));
+#endif // _LP64
+ addptr(rsp, sizeof(jdouble) * nb_args);
+ if (num_fpu_regs_in_use > 1) {
+ // Must save return value to stack and then restore entire FPU
+ // stack except incoming arguments
+ fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
+ for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) {
+ fld_d(Address(rsp, 0));
+ addptr(rsp, sizeof(jdouble));
+ }
+ fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble)));
+ addptr(rsp, sizeof(jdouble) * nb_args);
+ }
+
+ off = 0;
+ if (UseSSE == 1) {
+ movflt(xmm0, Address(rsp,off++*sizeof(jdouble)));
+ movflt(xmm1, Address(rsp,off++*sizeof(jdouble)));
+ movflt(xmm2, Address(rsp,off++*sizeof(jdouble)));
+ movflt(xmm3, Address(rsp,off++*sizeof(jdouble)));
+ movflt(xmm4, Address(rsp,off++*sizeof(jdouble)));
+ movflt(xmm5, Address(rsp,off++*sizeof(jdouble)));
+ movflt(xmm6, Address(rsp,off++*sizeof(jdouble)));
+ movflt(xmm7, Address(rsp,off++*sizeof(jdouble)));
+ } else if (UseSSE >= 2) {
+ movdbl(xmm0, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm1, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm2, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm3, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm4, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm5, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm6, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm7, Address(rsp,off++*sizeof(jdouble)));
+#ifdef _LP64
+ movdbl(xmm8, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm9, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm10, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm11, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm12, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm13, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm14, Address(rsp,off++*sizeof(jdouble)));
+ movdbl(xmm15, Address(rsp,off++*sizeof(jdouble)));
+#endif
+ }
+ if (UseSSE >= 1) {
+ addptr(rsp, sizeof(jdouble)* LP64_ONLY(16) NOT_LP64(8));
+ }
+ popa();
+}
+
static const double pi_4 = 0.7853981633974483;
void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
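For orientation, the scratch area fp_runtime_fallback builds below the pusha-saved registers looks like this during the call, sketched for nb_args = 1 and num_fpu_regs_in_use = 3 (a reading of the code above, taking jdouble as 8 bytes):

    [rsp +  0]  outgoing argument for the runtime call
    [rsp +  8]  FPU spill of former ST(2)
    [rsp + 16]  FPU spill of former ST(1)
    [rsp + 24]  FPU spill of former ST(0), the incoming argument; reused as
                the return-value slot (incoming_argument_and_return_value_offset
                = 16 is measured before the argument slot is pushed)
    [rsp + 32]  saved XMM registers, when UseSSE >= 1
    above that  the pusha-saved integer registers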
@@ -8092,73 +8482,27 @@ void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
// slow case: runtime call
bind(slow_case);
- // Preserve registers across runtime call
- pusha();
- int incoming_argument_and_return_value_offset = -1;
- if (num_fpu_regs_in_use > 1) {
- // Must preserve all other FPU regs (could alternatively convert
- // SharedRuntime::dsin and dcos into assembly routines known not to trash
- // FPU state, but can not trust C compiler)
- NEEDS_CLEANUP;
- // NOTE that in this case we also push the incoming argument to
- // the stack and restore it later; we also use this stack slot to
- // hold the return value from dsin or dcos.
- for (int i = 0; i < num_fpu_regs_in_use; i++) {
- subptr(rsp, sizeof(jdouble));
- fstp_d(Address(rsp, 0));
- }
- incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1);
- fld_d(Address(rsp, incoming_argument_and_return_value_offset));
- }
- subptr(rsp, sizeof(jdouble));
- fstp_d(Address(rsp, 0));
-#ifdef _LP64
- movdbl(xmm0, Address(rsp, 0));
-#endif // _LP64
- // NOTE: we must not use call_VM_leaf here because that requires a
- // complete interpreter frame in debug mode -- same bug as 4387334
- // MacroAssembler::call_VM_leaf_base is perfectly safe and will
- // do proper 64bit abi
-
- NEEDS_CLEANUP;
- // Need to add stack banging before this runtime call if it needs to
- // be taken; however, there is no generic stack banging routine at
- // the MacroAssembler level
switch(trig) {
case 's':
{
- MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 0);
+ fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 1, num_fpu_regs_in_use);
}
break;
case 'c':
{
- MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 0);
+ fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 1, num_fpu_regs_in_use);
}
break;
case 't':
{
- MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 0);
+ fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 1, num_fpu_regs_in_use);
}
break;
default:
assert(false, "bad intrinsic");
break;
}
-#ifdef _LP64
- movsd(Address(rsp, 0), xmm0);
- fld_d(Address(rsp, 0));
-#endif // _LP64
- addptr(rsp, sizeof(jdouble));
- if (num_fpu_regs_in_use > 1) {
- // Must save return value to stack and then restore entire FPU stack
- fstp_d(Address(rsp, incoming_argument_and_return_value_offset));
- for (int i = 0; i < num_fpu_regs_in_use; i++) {
- fld_d(Address(rsp, 0));
- addptr(rsp, sizeof(jdouble));
- }
- }
- popa();
// Come here with result in F-TOS
bind(done);
diff --git a/src/cpu/x86/vm/assembler_x86.hpp b/src/cpu/x86/vm/assembler_x86.hpp
index 4a72da83b..d0f1171f9 100644
--- a/src/cpu/x86/vm/assembler_x86.hpp
+++ b/src/cpu/x86/vm/assembler_x86.hpp
@@ -1148,6 +1148,9 @@ private:
void fxsave(Address dst);
void fyl2x();
+ void frndint();
+ void f2xm1();
+ void fldl2e();
void hlt();
@@ -2387,7 +2390,28 @@ class MacroAssembler: public Assembler {
void ldmxcsr(Address src) { Assembler::ldmxcsr(src); }
void ldmxcsr(AddressLiteral src);
+ // compute pow(x,y) and exp(x) with x86 instructions. These do not
+ // cover all corner cases and may produce NaN, in which case the
+ // caller must fall back to a runtime call.
+ void fast_pow();
+ void fast_exp();
+
+ // computes exp(x). Fallback to runtime call included.
+ void exp_with_fallback(int num_fpu_regs_in_use) { pow_or_exp(true, num_fpu_regs_in_use); }
+ // computes pow(x,y). Fallback to runtime call included.
+ void pow_with_fallback(int num_fpu_regs_in_use) { pow_or_exp(false, num_fpu_regs_in_use); }
+
private:
+
+ // call runtime as a fallback for trig functions and pow/exp.
+ void fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use);
+
+ // computes 2^(Y*log2(X)), with Y*log2(X) in ST(0)
+ void pow_exp_core_encoding();
+
+ // computes pow(x,y) or exp(x). Fallback to runtime call included.
+ void pow_or_exp(bool is_exp, int num_fpu_regs_in_use);
+
// these are private because users should be doing movflt/movdbl
void movss(Address dst, XMMRegister src) { Assembler::movss(dst, src); }
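The contract behind pow_with_fallback, restated as a hedged scalar sketch (names are illustrative; std::pow stands in for the SharedRuntime::dpow call):

    #include <cmath>

    static double pow_with_fallback_sketch(double x, double y) {
      double r;
      if (x >= 0.0) {
        r = std::exp2(y * std::log2(x));   // fast_pow: 2^(Y*log2(X))
      } else if (std::nearbyint(y) == y) { // X < 0: Y must be an integer
        r = std::exp2(y * std::log2(-x));  // abs(X)^Y
        if (std::fmod(y, 2.0) != 0.0)      // odd int(Y): negate (fchs);
          r = -r;                          // huge Y values are all even
      } else {
        r = std::nan("");                  // force the slow path
      }
      // fcmp + jcc(Assembler::parity, slow_case) is the x87 way of asking
      // "is r NaN?": NaN is the only value that compares unordered to itself.
      if (r != r) r = std::pow(x, y);      // stands in for SharedRuntime::dpow
      return r;
    }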
diff --git a/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp b/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp
index 4b2f8699e..764c9cff7 100644
--- a/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp
+++ b/src/cpu/x86/vm/c1_LIRAssembler_x86.cpp
@@ -2446,6 +2446,12 @@ void LIR_Assembler::intrinsic_op(LIR_Code code, LIR_Opr value, LIR_Opr unused, L
// Should consider not saving rbx, if not necessary
__ trigfunc('t', op->as_Op2()->fpu_stack_size());
break;
+ case lir_exp :
+ __ exp_with_fallback(op->as_Op2()->fpu_stack_size());
+ break;
+ case lir_pow :
+ __ pow_with_fallback(op->as_Op2()->fpu_stack_size());
+ break;
default : ShouldNotReachHere();
}
} else {
diff --git a/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp b/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp
index 1ff91cafb..66d8ed0a8 100644
--- a/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp
+++ b/src/cpu/x86/vm/c1_LIRGenerator_x86.cpp
@@ -823,7 +823,7 @@ void LIRGenerator::do_CompareAndSwap(Intrinsic* x, ValueType* type) {
void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
- assert(x->number_of_arguments() == 1, "wrong type");
+ assert(x->number_of_arguments() == 1 || (x->number_of_arguments() == 2 && x->id() == vmIntrinsics::_dpow), "wrong type");
LIRItem value(x->argument_at(0), this);
bool use_fpu = false;
@@ -834,6 +834,8 @@ void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
case vmIntrinsics::_dtan:
case vmIntrinsics::_dlog:
case vmIntrinsics::_dlog10:
+ case vmIntrinsics::_dexp:
+ case vmIntrinsics::_dpow:
use_fpu = true;
}
} else {
@@ -843,20 +845,37 @@ void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
value.load_item();
LIR_Opr calc_input = value.result();
+ LIR_Opr calc_input2 = NULL;
+ if (x->id() == vmIntrinsics::_dpow) {
+ LIRItem extra_arg(x->argument_at(1), this);
+ if (UseSSE < 2) {
+ extra_arg.set_destroys_register();
+ }
+ extra_arg.load_item();
+ calc_input2 = extra_arg.result();
+ }
LIR_Opr calc_result = rlock_result(x);
- // sin and cos need two free fpu stack slots, so register two temporary operands
+ // sin, cos, pow and exp need two free fpu stack slots, so register
+ // two temporary operands
LIR_Opr tmp1 = FrameMap::caller_save_fpu_reg_at(0);
LIR_Opr tmp2 = FrameMap::caller_save_fpu_reg_at(1);
if (use_fpu) {
LIR_Opr tmp = FrameMap::fpu0_double_opr;
+ int tmp_start = 1;
+ if (calc_input2 != NULL) {
+ __ move(calc_input2, tmp);
+ tmp_start = 2;
+ calc_input2 = tmp;
+ }
__ move(calc_input, tmp);
calc_input = tmp;
calc_result = tmp;
- tmp1 = FrameMap::caller_save_fpu_reg_at(1);
- tmp2 = FrameMap::caller_save_fpu_reg_at(2);
+
+ tmp1 = FrameMap::caller_save_fpu_reg_at(tmp_start);
+ tmp2 = FrameMap::caller_save_fpu_reg_at(tmp_start + 1);
}
switch(x->id()) {
@@ -867,6 +886,8 @@ void LIRGenerator::do_MathIntrinsic(Intrinsic* x) {
case vmIntrinsics::_dtan: __ tan (calc_input, calc_result, tmp1, tmp2); break;
case vmIntrinsics::_dlog: __ log (calc_input, calc_result, tmp1); break;
case vmIntrinsics::_dlog10: __ log10(calc_input, calc_result, tmp1); break;
+ case vmIntrinsics::_dexp: __ exp (calc_input, calc_result, tmp1, tmp2, FrameMap::rax_opr, FrameMap::rcx_opr, FrameMap::rdx_opr); break;
+ case vmIntrinsics::_dpow: __ pow (calc_input, calc_input2, calc_result, tmp1, tmp2, FrameMap::rax_opr, FrameMap::rcx_opr, FrameMap::rdx_opr); break;
default: ShouldNotReachHere();
}
diff --git a/src/cpu/x86/vm/c1_LinearScan_x86.cpp b/src/cpu/x86/vm/c1_LinearScan_x86.cpp
index 0c19851b3..77859b9d6 100644
--- a/src/cpu/x86/vm/c1_LinearScan_x86.cpp
+++ b/src/cpu/x86/vm/c1_LinearScan_x86.cpp
@@ -690,8 +690,8 @@ void FpuStackAllocator::handle_op2(LIR_Op2* op2) {
case lir_mul_strictfp:
case lir_div_strictfp: {
- assert(op2->tmp_opr()->is_fpu_register(), "strict operations need temporary fpu stack slot");
- insert_free_if_dead(op2->tmp_opr());
+ assert(op2->tmp1_opr()->is_fpu_register(), "strict operations need temporary fpu stack slot");
+ insert_free_if_dead(op2->tmp1_opr());
assert(sim()->stack_size() <= 7, "at least one stack slot must be free");
// fall-through: continue with the normal handling of lir_mul and lir_div
}
@@ -787,16 +787,17 @@ void FpuStackAllocator::handle_op2(LIR_Op2* op2) {
case lir_log:
case lir_log10: {
- // log and log10 needs one temporary fpu stack slot, so there is ontemporary
- // registers stored in temp of the operation.
- // the stack allocator must guarantee that the stack slots are really free,
- // otherwise there might be a stack overflow.
+ // log and log10 need one temporary fpu stack slot, so there
+ // is one temporary register stored in temp of the
+ // operation. The stack allocator must guarantee that the stack
+ // slots are really free, otherwise there might be a stack
+ // overflow.
assert(right->is_illegal(), "must be");
assert(left->is_fpu_register(), "must be");
assert(res->is_fpu_register(), "must be");
- assert(op2->tmp_opr()->is_fpu_register(), "must be");
+ assert(op2->tmp1_opr()->is_fpu_register(), "must be");
- insert_free_if_dead(op2->tmp_opr());
+ insert_free_if_dead(op2->tmp1_opr());
insert_free_if_dead(res, left);
insert_exchange(left);
do_rename(left, res);
@@ -812,8 +813,9 @@ void FpuStackAllocator::handle_op2(LIR_Op2* op2) {
case lir_tan:
case lir_sin:
- case lir_cos: {
- // sin and cos need two temporary fpu stack slots, so there are two temporary
+ case lir_cos:
+ case lir_exp: {
+ // sin, cos and exp need two temporary fpu stack slots, so there are two temporary
// registers (stored in right and temp of the operation).
// the stack allocator must guarantee that the stack slots are really free,
// otherwise there might be a stack overflow.
@@ -821,11 +823,11 @@ void FpuStackAllocator::handle_op2(LIR_Op2* op2) {
assert(res->is_fpu_register(), "must be");
// assert(left->is_last_use(), "old value gets destroyed");
assert(right->is_fpu_register(), "right is used as the first temporary register");
- assert(op2->tmp_opr()->is_fpu_register(), "temp is used as the second temporary register");
- assert(fpu_num(left) != fpu_num(right) && fpu_num(right) != fpu_num(op2->tmp_opr()) && fpu_num(op2->tmp_opr()) != fpu_num(res), "need distinct temp registers");
+ assert(op2->tmp1_opr()->is_fpu_register(), "temp is used as the second temporary register");
+ assert(fpu_num(left) != fpu_num(right) && fpu_num(right) != fpu_num(op2->tmp1_opr()) && fpu_num(op2->tmp1_opr()) != fpu_num(res), "need distinct temp registers");
insert_free_if_dead(right);
- insert_free_if_dead(op2->tmp_opr());
+ insert_free_if_dead(op2->tmp1_opr());
insert_free_if_dead(res, left);
insert_exchange(left);
@@ -839,6 +841,53 @@ void FpuStackAllocator::handle_op2(LIR_Op2* op2) {
break;
}
+ case lir_pow: {
+ // pow needs two temporary fpu stack slots, so there are two temporary
+ // registers (stored in tmp1 and tmp2 of the operation).
+ // the stack allocator must guarantee that the stack slots are really free,
+ // otherwise there might be a stack overflow.
+ assert(left->is_fpu_register(), "must be");
+ assert(right->is_fpu_register(), "must be");
+ assert(res->is_fpu_register(), "must be");
+
+ assert(op2->tmp1_opr()->is_fpu_register(), "tmp1 is the first temporary register");
+ assert(op2->tmp2_opr()->is_fpu_register(), "tmp2 is the second temporary register");
+ assert(fpu_num(left) != fpu_num(right) && fpu_num(left) != fpu_num(op2->tmp1_opr()) && fpu_num(left) != fpu_num(op2->tmp2_opr()) && fpu_num(left) != fpu_num(res), "need distinct temp registers");
+ assert(fpu_num(right) != fpu_num(op2->tmp1_opr()) && fpu_num(right) != fpu_num(op2->tmp2_opr()) && fpu_num(right) != fpu_num(res), "need distinct temp registers");
+ assert(fpu_num(op2->tmp1_opr()) != fpu_num(op2->tmp2_opr()) && fpu_num(op2->tmp1_opr()) != fpu_num(res), "need distinct temp registers");
+ assert(fpu_num(op2->tmp2_opr()) != fpu_num(res), "need distinct temp registers");
+
+ insert_free_if_dead(op2->tmp1_opr());
+ insert_free_if_dead(op2->tmp2_opr());
+
+ // Must bring both operands to top of stack with following operand ordering:
+ // * fpu stack before pow: ... right left
+ // * fpu stack after pow: ... left
+
+ insert_free_if_dead(res, right);
+
+ if (tos_offset(right) != 1) {
+ insert_exchange(right);
+ insert_exchange(1);
+ }
+ insert_exchange(left);
+ assert(tos_offset(right) == 1, "check");
+ assert(tos_offset(left) == 0, "check");
+
+ new_left = to_fpu_stack_top(left);
+ new_right = to_fpu_stack(right);
+
+ op2->set_fpu_stack_size(sim()->stack_size());
+ assert(sim()->stack_size() <= 6, "at least two stack slots must be free");
+
+ sim()->pop();
+
+ do_rename(right, res);
+
+ new_res = to_fpu_stack_top(res);
+ break;
+ }
+
default: {
assert(false, "missed a fpu-operation");
}
diff --git a/src/cpu/x86/vm/interpreter_x86_32.cpp b/src/cpu/x86/vm/interpreter_x86_32.cpp
index 43a5a18a5..8072354e9 100644
--- a/src/cpu/x86/vm/interpreter_x86_32.cpp
+++ b/src/cpu/x86/vm/interpreter_x86_32.cpp
@@ -181,6 +181,19 @@ address InterpreterGenerator::generate_math_entry(AbstractInterpreter::MethodKin
__ push_fTOS();
__ pop_fTOS();
break;
+ case Interpreter::java_lang_math_pow:
+ __ fld_d(Address(rsp, 3*wordSize)); // second argument
+ __ pow_with_fallback(0);
+ // Store to the stack to round 80-bit extended precision back to 64 bits
+ __ push_fTOS();
+ __ pop_fTOS();
+ break;
+ case Interpreter::java_lang_math_exp:
+ __ exp_with_fallback(0);
+ // Store to the stack to round 80-bit extended precision back to 64 bits
+ __ push_fTOS();
+ __ pop_fTOS();
+ break;
default :
ShouldNotReachHere();
}
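The push_fTOS/pop_fTOS pairs exist because x87 computes in 80-bit extended precision while Java's result must be a 64-bit double. A small stand-alone illustration of that rounding step, using long double as a stand-in for ST(0) (on x86 with GCC/Clang, long double is the 80-bit format):

    #include <cstdio>

    int main() {
      long double extended = 1.0L / 3.0L; // carries extra precision, like ST(0)
      double rounded = (double)extended;  // what the store/reload performs
      std::printf("%.21Lf\n%.21f\n", extended, rounded);
      return 0;
    }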
diff --git a/src/cpu/x86/vm/interpreter_x86_64.cpp b/src/cpu/x86/vm/interpreter_x86_64.cpp
index 1c124c2f0..761437378 100644
--- a/src/cpu/x86/vm/interpreter_x86_64.cpp
+++ b/src/cpu/x86/vm/interpreter_x86_64.cpp
@@ -271,6 +271,14 @@ address InterpreterGenerator::generate_math_entry(AbstractInterpreter::MethodKin
case Interpreter::java_lang_math_log10:
__ flog10();
break;
+ case Interpreter::java_lang_math_pow:
+ __ fld_d(Address(rsp, 3*wordSize)); // second argument (one
+ // empty stack slot)
+ __ pow_with_fallback(0);
+ break;
+ case Interpreter::java_lang_math_exp:
+ __ exp_with_fallback(0);
+ break;
default :
ShouldNotReachHere();
}
diff --git a/src/cpu/x86/vm/stubGenerator_x86_32.cpp b/src/cpu/x86/vm/stubGenerator_x86_32.cpp
index 4d4e66f60..43d51bd38 100644
--- a/src/cpu/x86/vm/stubGenerator_x86_32.cpp
+++ b/src/cpu/x86/vm/stubGenerator_x86_32.cpp
@@ -2136,11 +2136,23 @@ class StubGenerator: public StubCodeGenerator {
__ trigfunc('t');
__ ret(0);
}
+ {
+ StubCodeMark mark(this, "StubRoutines", "exp");
+ StubRoutines::_intrinsic_exp = (double (*)(double)) __ pc();
- // The intrinsic version of these seem to return the same value as
- // the strict version.
- StubRoutines::_intrinsic_exp = SharedRuntime::dexp;
- StubRoutines::_intrinsic_pow = SharedRuntime::dpow;
+ __ fld_d(Address(rsp, 4));
+ __ exp_with_fallback(0);
+ __ ret(0);
+ }
+ {
+ StubCodeMark mark(this, "StubRoutines", "pow");
+ StubRoutines::_intrinsic_pow = (double (*)(double,double)) __ pc();
+
+ __ fld_d(Address(rsp, 12));
+ __ fld_d(Address(rsp, 4));
+ __ pow_with_fallback(0);
+ __ ret(0);
+ }
}
public:
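The 4- and 12-byte offsets in the new 32-bit stubs follow from the cdecl frame at the stub entry point (a reading of the code, not something the diff states):

    [esp +  0]  return address
    [esp +  4]  x, the first double argument   -> fld_d(Address(rsp, 4))
    [esp + 12]  y, the second double argument  -> fld_d(Address(rsp, 12))

y is loaded first so that after both loads x sits in ST(0) and y in ST(1), the operand order pow_with_fallback expects.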
diff --git a/src/cpu/x86/vm/stubGenerator_x86_64.cpp b/src/cpu/x86/vm/stubGenerator_x86_64.cpp
index 9d9472200..30382b5ab 100644
--- a/src/cpu/x86/vm/stubGenerator_x86_64.cpp
+++ b/src/cpu/x86/vm/stubGenerator_x86_64.cpp
@@ -2928,11 +2928,34 @@ class StubGenerator: public StubCodeGenerator {
__ addq(rsp, 8);
__ ret(0);
}
+ {
+ StubCodeMark mark(this, "StubRoutines", "exp");
+ StubRoutines::_intrinsic_exp = (double (*)(double)) __ pc();
+
+ __ subq(rsp, 8);
+ __ movdbl(Address(rsp, 0), xmm0);
+ __ fld_d(Address(rsp, 0));
+ __ exp_with_fallback(0);
+ __ fstp_d(Address(rsp, 0));
+ __ movdbl(xmm0, Address(rsp, 0));
+ __ addq(rsp, 8);
+ __ ret(0);
+ }
+ {
+ StubCodeMark mark(this, "StubRoutines", "pow");
+ StubRoutines::_intrinsic_pow = (double (*)(double,double)) __ pc();
- // The intrinsic version of these seem to return the same value as
- // the strict version.
- StubRoutines::_intrinsic_exp = SharedRuntime::dexp;
- StubRoutines::_intrinsic_pow = SharedRuntime::dpow;
+ __ subq(rsp, 8);
+ __ movdbl(Address(rsp, 0), xmm1);
+ __ fld_d(Address(rsp, 0));
+ __ movdbl(Address(rsp, 0), xmm0);
+ __ fld_d(Address(rsp, 0));
+ __ pow_with_fallback(0);
+ __ fstp_d(Address(rsp, 0));
+ __ movdbl(xmm0, Address(rsp, 0));
+ __ addq(rsp, 8);
+ __ ret(0);
+ }
}
#undef __
diff --git a/src/cpu/x86/vm/templateInterpreter_x86_32.cpp b/src/cpu/x86/vm/templateInterpreter_x86_32.cpp
index 29533832e..fda7980d8 100644
--- a/src/cpu/x86/vm/templateInterpreter_x86_32.cpp
+++ b/src/cpu/x86/vm/templateInterpreter_x86_32.cpp
@@ -1518,7 +1518,9 @@ address AbstractInterpreterGenerator::generate_method_entry(AbstractInterpreter:
case Interpreter::java_lang_math_abs : // fall thru
case Interpreter::java_lang_math_log : // fall thru
case Interpreter::java_lang_math_log10 : // fall thru
- case Interpreter::java_lang_math_sqrt : entry_point = ((InterpreterGenerator*)this)->generate_math_entry(kind); break;
+ case Interpreter::java_lang_math_sqrt : // fall thru
+ case Interpreter::java_lang_math_pow : // fall thru
+ case Interpreter::java_lang_math_exp : entry_point = ((InterpreterGenerator*)this)->generate_math_entry(kind); break;
case Interpreter::java_lang_ref_reference_get
: entry_point = ((InterpreterGenerator*)this)->generate_Reference_get_entry(); break;
default : ShouldNotReachHere(); break;
@@ -1540,7 +1542,9 @@ bool AbstractInterpreter::can_be_compiled(methodHandle m) {
case Interpreter::java_lang_math_abs : // fall thru
case Interpreter::java_lang_math_log : // fall thru
case Interpreter::java_lang_math_log10 : // fall thru
- case Interpreter::java_lang_math_sqrt :
+ case Interpreter::java_lang_math_sqrt : // fall thru
+ case Interpreter::java_lang_math_pow : // fall thru
+ case Interpreter::java_lang_math_exp :
return false;
default:
return true;
diff --git a/src/cpu/x86/vm/templateInterpreter_x86_64.cpp b/src/cpu/x86/vm/templateInterpreter_x86_64.cpp
index 110d8ebdf..2e78cd5aa 100644
--- a/src/cpu/x86/vm/templateInterpreter_x86_64.cpp
+++ b/src/cpu/x86/vm/templateInterpreter_x86_64.cpp
@@ -1534,7 +1534,9 @@ address AbstractInterpreterGenerator::generate_method_entry(
case Interpreter::java_lang_math_abs : // fall thru
case Interpreter::java_lang_math_log : // fall thru
case Interpreter::java_lang_math_log10 : // fall thru
- case Interpreter::java_lang_math_sqrt : entry_point = ((InterpreterGenerator*) this)->generate_math_entry(kind); break;
+ case Interpreter::java_lang_math_sqrt : // fall thru
+ case Interpreter::java_lang_math_pow : // fall thru
+ case Interpreter::java_lang_math_exp : entry_point = ((InterpreterGenerator*) this)->generate_math_entry(kind); break;
case Interpreter::java_lang_ref_reference_get
: entry_point = ((InterpreterGenerator*)this)->generate_Reference_get_entry(); break;
default : ShouldNotReachHere(); break;
@@ -1558,7 +1560,9 @@ bool AbstractInterpreter::can_be_compiled(methodHandle m) {
case Interpreter::java_lang_math_abs : // fall thru
case Interpreter::java_lang_math_log : // fall thru
case Interpreter::java_lang_math_log10 : // fall thru
- case Interpreter::java_lang_math_sqrt :
+ case Interpreter::java_lang_math_sqrt : // fall thru
+ case Interpreter::java_lang_math_pow : // fall thru
+ case Interpreter::java_lang_math_exp :
return false;
default:
return true;
diff --git a/src/cpu/x86/vm/x86_32.ad b/src/cpu/x86/vm/x86_32.ad
index 1c5248fec..cd4adccd3 100644
--- a/src/cpu/x86/vm/x86_32.ad
+++ b/src/cpu/x86/vm/x86_32.ad
@@ -2536,45 +2536,6 @@ encode %{
__ fld_d(Address(rsp, 0));
%}
- // Compute X^Y using Intel's fast hardware instructions, if possible.
- // Otherwise return a NaN.
- enc_class pow_exp_core_encoding %{
- // FPR1 holds Y*ln2(X). Compute FPR1 = 2^(Y*ln2(X))
- emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xC0); // fdup = fld st(0) Q Q
- emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xFC); // frndint int(Q) Q
- emit_opcode(cbuf,0xDC); emit_opcode(cbuf,0xE9); // fsub st(1) -= st(0); int(Q) frac(Q)
- emit_opcode(cbuf,0xDB); // FISTP [ESP] frac(Q)
- emit_opcode(cbuf,0x1C);
- emit_d8(cbuf,0x24);
- emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xF0); // f2xm1 2^frac(Q)-1
- emit_opcode(cbuf,0xD9); emit_opcode(cbuf,0xE8); // fld1 1 2^frac(Q)-1
- emit_opcode(cbuf,0xDE); emit_opcode(cbuf,0xC1); // faddp 2^frac(Q)
- emit_opcode(cbuf,0x8B); // mov rax,[esp+0]=int(Q)
- encode_RegMem(cbuf, EAX_enc, ESP_enc, 0x4, 0, 0, false);
- emit_opcode(cbuf,0xC7); // mov rcx,0xFFFFF800 - overflow mask
- emit_rm(cbuf, 0x3, 0x0, ECX_enc);
- emit_d32(cbuf,0xFFFFF800);
- emit_opcode(cbuf,0x81); // add rax,1023 - the double exponent bias
- emit_rm(cbuf, 0x3, 0x0, EAX_enc);
- emit_d32(cbuf,1023);
- emit_opcode(cbuf,0x8B); // mov rbx,eax
- emit_rm(cbuf, 0x3, EBX_enc, EAX_enc);
- emit_opcode(cbuf,0xC1); // shl rax,20 - Slide to exponent position
- emit_rm(cbuf,0x3,0x4,EAX_enc);
- emit_d8(cbuf,20);
- emit_opcode(cbuf,0x85); // test rbx,ecx - check for overflow
- emit_rm(cbuf, 0x3, EBX_enc, ECX_enc);
- emit_opcode(cbuf,0x0F); emit_opcode(cbuf,0x45); // CMOVne rax,ecx - overflow; stuff NAN into EAX
- emit_rm(cbuf, 0x3, EAX_enc, ECX_enc);
- emit_opcode(cbuf,0x89); // mov [esp+4],eax - Store as part of double word
- encode_RegMem(cbuf, EAX_enc, ESP_enc, 0x4, 0, 4, false);
- emit_opcode(cbuf,0xC7); // mov [esp+0],0 - [ESP] = (double)(1<<int(Q)) = 2^int(Q)
- encode_RegMem(cbuf, 0x0, ESP_enc, 0x4, 0, 0, false);
- emit_d32(cbuf,0);
- emit_opcode(cbuf,0xDC); // fmul dword st(0),[esp+0]; FPR1 = 2^int(Q)*2^frac(Q) = 2^Q
- encode_RegMem(cbuf, 0x1, ESP_enc, 0x4, 0, 0, false);
- %}
-
enc_class Push_Result_Mod_DPR( regDPR src) %{
if ($src$$reg != FPR1L_enc) {
// fincstp
@@ -10100,162 +10061,68 @@ instruct sqrtDPR_reg(regDPR dst, regDPR src) %{
ins_pipe( pipe_slow );
%}
-instruct powDPR_reg(regDPR X, regDPR1 Y, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
+instruct powDPR_reg(regDPR X, regDPR1 Y, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{
predicate (UseSSE<=1);
match(Set Y (PowD X Y)); // Raise X to the Yth power
- effect(KILL rax, KILL rbx, KILL rcx);
- format %{ "SUB ESP,8\t\t# Fast-path POW encoding\n\t"
- "FLD_D $X\n\t"
- "FYL2X \t\t\t# Q=Y*ln2(X)\n\t"
-
- "FDUP \t\t\t# Q Q\n\t"
- "FRNDINT\t\t\t# int(Q) Q\n\t"
- "FSUB ST(1),ST(0)\t# int(Q) frac(Q)\n\t"
- "FISTP dword [ESP]\n\t"
- "F2XM1 \t\t\t# 2^frac(Q)-1 int(Q)\n\t"
- "FLD1 \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t"
- "FADDP \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead
- "MOV EAX,[ESP]\t# Pick up int(Q)\n\t"
- "MOV ECX,0xFFFFF800\t# Overflow mask\n\t"
- "ADD EAX,1023\t\t# Double exponent bias\n\t"
- "MOV EBX,EAX\t\t# Preshifted biased expo\n\t"
- "SHL EAX,20\t\t# Shift exponent into place\n\t"
- "TEST EBX,ECX\t\t# Check for overflow\n\t"
- "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t"
- "MOV [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t"
- "MOV [ESP+0],0\n\t"
- "FMUL ST(0),[ESP+0]\t# Scale\n\t"
-
- "ADD ESP,8"
- %}
- ins_encode( push_stack_temp_qword,
- Push_Reg_DPR(X),
- Opcode(0xD9), Opcode(0xF1), // fyl2x
- pow_exp_core_encoding,
- pop_stack_temp_qword);
+ effect(KILL rax, KILL rdx, KILL rcx, KILL cr);
+ format %{ "fast_pow $X $Y -> $Y // KILL $rax, $rcx, $rdx" %}
+ ins_encode %{
+ __ subptr(rsp, 8);
+ __ fld_s($X$$reg - 1);
+ __ fast_pow();
+ __ addptr(rsp, 8);
+ %}
ins_pipe( pipe_slow );
%}
-instruct powD_reg(regD dst, regD src0, regD src1, regDPR1 tmp1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx ) %{
+instruct powD_reg(regD dst, regD src0, regD src1, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{
predicate (UseSSE>=2);
match(Set dst (PowD src0 src1)); // Raise src0 to the src1'th power
- effect(KILL tmp1, KILL rax, KILL rbx, KILL rcx );
- format %{ "SUB ESP,8\t\t# Fast-path POW encoding\n\t"
- "MOVSD [ESP],$src1\n\t"
- "FLD FPR1,$src1\n\t"
- "MOVSD [ESP],$src0\n\t"
- "FLD FPR1,$src0\n\t"
- "FYL2X \t\t\t# Q=Y*ln2(X)\n\t"
-
- "FDUP \t\t\t# Q Q\n\t"
- "FRNDINT\t\t\t# int(Q) Q\n\t"
- "FSUB ST(1),ST(0)\t# int(Q) frac(Q)\n\t"
- "FISTP dword [ESP]\n\t"
- "F2XM1 \t\t\t# 2^frac(Q)-1 int(Q)\n\t"
- "FLD1 \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t"
- "FADDP \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead
- "MOV EAX,[ESP]\t# Pick up int(Q)\n\t"
- "MOV ECX,0xFFFFF800\t# Overflow mask\n\t"
- "ADD EAX,1023\t\t# Double exponent bias\n\t"
- "MOV EBX,EAX\t\t# Preshifted biased expo\n\t"
- "SHL EAX,20\t\t# Shift exponent into place\n\t"
- "TEST EBX,ECX\t\t# Check for overflow\n\t"
- "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t"
- "MOV [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t"
- "MOV [ESP+0],0\n\t"
- "FMUL ST(0),[ESP+0]\t# Scale\n\t"
-
- "FST_D [ESP]\n\t"
- "MOVSD $dst,[ESP]\n\t"
- "ADD ESP,8"
- %}
- ins_encode( push_stack_temp_qword,
- push_xmm_to_fpr1(src1),
- push_xmm_to_fpr1(src0),
- Opcode(0xD9), Opcode(0xF1), // fyl2x
- pow_exp_core_encoding,
- Push_ResultD(dst) );
+ effect(KILL rax, KILL rdx, KILL rcx, KILL cr);
+ format %{ "fast_pow $src0 $src1 -> $dst // KILL $rax, $rcx, $rdx" %}
+ ins_encode %{
+ __ subptr(rsp, 8);
+ __ movdbl(Address(rsp, 0), $src1$$XMMRegister);
+ __ fld_d(Address(rsp, 0));
+ __ movdbl(Address(rsp, 0), $src0$$XMMRegister);
+ __ fld_d(Address(rsp, 0));
+ __ fast_pow();
+ __ fstp_d(Address(rsp, 0));
+ __ movdbl($dst$$XMMRegister, Address(rsp, 0));
+ __ addptr(rsp, 8);
+ %}
ins_pipe( pipe_slow );
%}
-instruct expDPR_reg(regDPR1 dpr1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
+instruct expDPR_reg(regDPR1 dpr1, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{
predicate (UseSSE<=1);
match(Set dpr1 (ExpD dpr1));
- effect(KILL rax, KILL rbx, KILL rcx);
- format %{ "SUB ESP,8\t\t# Fast-path EXP encoding"
- "FLDL2E \t\t\t# Ld log2(e) X\n\t"
- "FMULP \t\t\t# Q=X*log2(e)\n\t"
-
- "FDUP \t\t\t# Q Q\n\t"
- "FRNDINT\t\t\t# int(Q) Q\n\t"
- "FSUB ST(1),ST(0)\t# int(Q) frac(Q)\n\t"
- "FISTP dword [ESP]\n\t"
- "F2XM1 \t\t\t# 2^frac(Q)-1 int(Q)\n\t"
- "FLD1 \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t"
- "FADDP \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead
- "MOV EAX,[ESP]\t# Pick up int(Q)\n\t"
- "MOV ECX,0xFFFFF800\t# Overflow mask\n\t"
- "ADD EAX,1023\t\t# Double exponent bias\n\t"
- "MOV EBX,EAX\t\t# Preshifted biased expo\n\t"
- "SHL EAX,20\t\t# Shift exponent into place\n\t"
- "TEST EBX,ECX\t\t# Check for overflow\n\t"
- "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t"
- "MOV [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t"
- "MOV [ESP+0],0\n\t"
- "FMUL ST(0),[ESP+0]\t# Scale\n\t"
-
- "ADD ESP,8"
- %}
- ins_encode( push_stack_temp_qword,
- Opcode(0xD9), Opcode(0xEA), // fldl2e
- Opcode(0xDE), Opcode(0xC9), // fmulp
- pow_exp_core_encoding,
- pop_stack_temp_qword);
- ins_pipe( pipe_slow );
-%}
-
-instruct expD_reg(regD dst, regD src, regDPR1 tmp1, eAXRegI rax, eBXRegI rbx, eCXRegI rcx) %{
+ effect(KILL rax, KILL rcx, KILL rdx, KILL cr);
+ format %{ "fast_exp $dpr1 -> $dpr1 // KILL $rax, $rcx, $rdx" %}
+ ins_encode %{
+ __ fast_exp();
+ %}
+ ins_pipe( pipe_slow );
+%}
+
+instruct expD_reg(regD dst, regD src, eAXRegI rax, eDXRegI rdx, eCXRegI rcx, eFlagsReg cr) %{
predicate (UseSSE>=2);
match(Set dst (ExpD src));
- effect(KILL tmp1, KILL rax, KILL rbx, KILL rcx);
- format %{ "SUB ESP,8\t\t# Fast-path EXP encoding\n\t"
- "MOVSD [ESP],$src\n\t"
- "FLDL2E \t\t\t# Ld log2(e) X\n\t"
- "FMULP \t\t\t# Q=X*log2(e) X\n\t"
-
- "FDUP \t\t\t# Q Q\n\t"
- "FRNDINT\t\t\t# int(Q) Q\n\t"
- "FSUB ST(1),ST(0)\t# int(Q) frac(Q)\n\t"
- "FISTP dword [ESP]\n\t"
- "F2XM1 \t\t\t# 2^frac(Q)-1 int(Q)\n\t"
- "FLD1 \t\t\t# 1 2^frac(Q)-1 int(Q)\n\t"
- "FADDP \t\t\t# 2^frac(Q) int(Q)\n\t" // could use FADD [1.000] instead
- "MOV EAX,[ESP]\t# Pick up int(Q)\n\t"
- "MOV ECX,0xFFFFF800\t# Overflow mask\n\t"
- "ADD EAX,1023\t\t# Double exponent bias\n\t"
- "MOV EBX,EAX\t\t# Preshifted biased expo\n\t"
- "SHL EAX,20\t\t# Shift exponent into place\n\t"
- "TEST EBX,ECX\t\t# Check for overflow\n\t"
- "CMOVne EAX,ECX\t\t# If overflow, stuff NaN into EAX\n\t"
- "MOV [ESP+4],EAX\t# Marshal 64-bit scaling double\n\t"
- "MOV [ESP+0],0\n\t"
- "FMUL ST(0),[ESP+0]\t# Scale\n\t"
-
- "FST_D [ESP]\n\t"
- "MOVSD $dst,[ESP]\n\t"
- "ADD ESP,8"
- %}
- ins_encode( Push_SrcD(src),
- Opcode(0xD9), Opcode(0xEA), // fldl2e
- Opcode(0xDE), Opcode(0xC9), // fmulp
- pow_exp_core_encoding,
- Push_ResultD(dst) );
+ effect(KILL rax, KILL rcx, KILL rdx, KILL cr);
+ format %{ "fast_exp $dst -> $src // KILL $rax, $rcx, $rdx" %}
+ ins_encode %{
+ __ subptr(rsp, 8);
+ __ movdbl(Address(rsp, 0), $src$$XMMRegister);
+ __ fld_d(Address(rsp, 0));
+ __ fast_exp();
+ __ fstp_d(Address(rsp, 0));
+ __ movdbl($dst$$XMMRegister, Address(rsp, 0));
+ __ addptr(rsp, 8);
+ %}
ins_pipe( pipe_slow );
%}
-
-
instruct log10DPR_reg(regDPR1 dst, regDPR1 src) %{
predicate (UseSSE<=1);
// The source Double operand on FPU stack
diff --git a/src/cpu/x86/vm/x86_64.ad b/src/cpu/x86/vm/x86_64.ad
index 81a01ff9e..80928c3fa 100644
--- a/src/cpu/x86/vm/x86_64.ad
+++ b/src/cpu/x86/vm/x86_64.ad
@@ -9823,7 +9823,39 @@ instruct logD_reg(regD dst) %{
ins_pipe( pipe_slow );
%}
+instruct powD_reg(regD dst, regD src0, regD src1, rax_RegI rax, rdx_RegI rdx, rcx_RegI rcx, rFlagsReg cr) %{
+ match(Set dst (PowD src0 src1)); // Raise src0 to the src1'th power
+ effect(KILL rax, KILL rdx, KILL rcx, KILL cr);
+ format %{ "fast_pow $src0 $src1 -> $dst // KILL $rax, $rcx, $rdx" %}
+ ins_encode %{
+ __ subptr(rsp, 8);
+ __ movdbl(Address(rsp, 0), $src1$$XMMRegister);
+ __ fld_d(Address(rsp, 0));
+ __ movdbl(Address(rsp, 0), $src0$$XMMRegister);
+ __ fld_d(Address(rsp, 0));
+ __ fast_pow();
+ __ fstp_d(Address(rsp, 0));
+ __ movdbl($dst$$XMMRegister, Address(rsp, 0));
+ __ addptr(rsp, 8);
+ %}
+ ins_pipe( pipe_slow );
+%}
+instruct expD_reg(regD dst, regD src, rax_RegI rax, rdx_RegI rdx, rcx_RegI rcx, rFlagsReg cr) %{
+ match(Set dst (ExpD src));
+ effect(KILL rax, KILL rcx, KILL rdx, KILL cr);
+ format %{ "fast_exp $dst -> $src // KILL $rax, $rcx, $rdx" %}
+ ins_encode %{
+ __ subptr(rsp, 8);
+ __ movdbl(Address(rsp, 0), $src$$XMMRegister);
+ __ fld_d(Address(rsp, 0));
+ __ fast_exp();
+ __ fstp_d(Address(rsp, 0));
+ __ movdbl($dst$$XMMRegister, Address(rsp, 0));
+ __ addptr(rsp, 8);
+ %}
+ ins_pipe( pipe_slow );
+%}
//----------Arithmetic Conversion Instructions---------------------------------