author     Thomas Koenig <tkoenig@gcc.gnu.org>    2021-09-13 19:49:49 +0200
committer  Thomas Koenig <tkoenig@gcc.gnu.org>    2021-09-13 19:49:49 +0200
commit     b18a97e5dd0935e1c4a626c230f21457d0aad3d5 (patch)
tree       c1818f41af6fe780deafb6cd6a183f32085fe654 /gcc/config/i386/i386-expand.c
parent     e76a53644c9d70e998c0d050e9a456af388c6b61 (diff)
Merged current trunk to branch.
Diffstat (limited to 'gcc/config/i386/i386-expand.c')
-rw-r--r--  gcc/config/i386/i386-expand.c | 2284
1 file changed, 1771 insertions(+), 513 deletions(-)
diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c
index 6e08fd32726..e117afb16b8 100644
--- a/gcc/config/i386/i386-expand.c
+++ b/gcc/config/i386/i386-expand.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 1988-2020 Free Software Foundation, Inc.
+/* Copyright (C) 1988-2021 Free Software Foundation, Inc.
This file is part of GCC.
@@ -154,9 +154,13 @@ split_double_mode (machine_mode mode, rtx operands[],
lo_half[num] = simplify_gen_subreg (half_mode, op,
GET_MODE (op) == VOIDmode
? mode : GET_MODE (op), 0);
- hi_half[num] = simplify_gen_subreg (half_mode, op,
- GET_MODE (op) == VOIDmode
- ? mode : GET_MODE (op), byte);
+
+ rtx tmp = simplify_gen_subreg (half_mode, op,
+ GET_MODE (op) == VOIDmode
+ ? mode : GET_MODE (op), byte);
+ /* simplify_gen_subreg will return NULL RTX for the
+ high half of the paradoxical subreg. */
+ hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode);
}
}
}
@@ -186,6 +190,82 @@ ix86_expand_clear (rtx dest)
emit_insn (tmp);
}
+/* Return true if V can be broadcasted from an integer of WIDTH bits
+ which is returned in VAL_BROADCAST. Otherwise, return false. */
+
+static bool
+ix86_broadcast (HOST_WIDE_INT v, unsigned int width,
+ HOST_WIDE_INT &val_broadcast)
+{
+ wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT);
+ val_broadcast = wi::extract_uhwi (val, 0, width);
+ for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width)
+ {
+ HOST_WIDE_INT each = wi::extract_uhwi (val, i, width);
+ if (val_broadcast != each)
+ return false;
+ }
+ val_broadcast = sext_hwi (val_broadcast, width);
+ return true;
+}
+
+/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE. */
+
+static rtx
+ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op)
+{
+ /* Don't use integer vector broadcast if we can't move from GPR to SSE
+ register directly. */
+ if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
+ return nullptr;
+
+ /* Convert CONST_WIDE_INT to a non-standard SSE constant integer
+ broadcast only if vector broadcast is available. */
+ if (!TARGET_AVX
+ || !CONST_WIDE_INT_P (op)
+ || standard_sse_constant_p (op, mode))
+ return nullptr;
+
+ HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0);
+ HOST_WIDE_INT val_broadcast;
+ scalar_int_mode broadcast_mode;
+ if (TARGET_AVX2
+ && ix86_broadcast (val, GET_MODE_BITSIZE (QImode),
+ val_broadcast))
+ broadcast_mode = QImode;
+ else if (TARGET_AVX2
+ && ix86_broadcast (val, GET_MODE_BITSIZE (HImode),
+ val_broadcast))
+ broadcast_mode = HImode;
+ else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode),
+ val_broadcast))
+ broadcast_mode = SImode;
+ else if (TARGET_64BIT
+ && ix86_broadcast (val, GET_MODE_BITSIZE (DImode),
+ val_broadcast))
+ broadcast_mode = DImode;
+ else
+ return nullptr;
+
+ /* Check if OP can be broadcasted from VAL. */
+ for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++)
+ if (val != CONST_WIDE_INT_ELT (op, i))
+ return nullptr;
+
+ unsigned int nunits = (GET_MODE_SIZE (mode)
+ / GET_MODE_SIZE (broadcast_mode));
+ machine_mode vector_mode;
+ if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode))
+ gcc_unreachable ();
+ rtx target = ix86_gen_scratch_sse_rtx (vector_mode);
+ bool ok = ix86_expand_vector_init_duplicate (false, vector_mode,
+ target,
+ GEN_INT (val_broadcast));
+ gcc_assert (ok);
+ target = lowpart_subreg (mode, target, vector_mode);
+ return target;
+}
+
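The two helpers above recognize immediates that are just a narrow pattern repeated across the whole value, so the constant can be materialized with a vector broadcast instead of a full constant-pool load. A minimal standalone sketch of the repeated-pattern test (plain C, illustration only, not part of this commit; the helper name is made up):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Return true if VAL is one WIDTH-bit piece repeated across all 64 bits;
   store that piece in *PIECE.  Mirrors the loop in ix86_broadcast, minus
   the final sign extension of the result.  */
static bool
repeats_every_width_bits (uint64_t val, unsigned width, uint64_t *piece)
{
  uint64_t mask = (width >= 64) ? ~0ull : ((1ull << width) - 1);
  *piece = val & mask;
  for (unsigned i = width; i < 64; i += width)
    if (((val >> i) & mask) != *piece)
      return false;
  return true;
}

int
main (void)
{
  uint64_t piece;
  /* Repeats every 8 bits, so it can come from a byte broadcast.  */
  printf ("%d\n", repeats_every_width_bits (0x4242424242424242ull, 8, &piece));
  /* Repeats every 32 bits, but not every 16.  */
  printf ("%d\n", repeats_every_width_bits (0x0001000200010002ull, 16, &piece));
  printf ("%d\n", repeats_every_width_bits (0x0001000200010002ull, 32, &piece));
  return 0;
}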
void
ix86_expand_move (machine_mode mode, rtx operands[])
{
@@ -343,20 +423,29 @@ ix86_expand_move (machine_mode mode, rtx operands[])
&& optimize)
op1 = copy_to_mode_reg (mode, op1);
- if (can_create_pseudo_p ()
- && CONST_DOUBLE_P (op1))
+ if (can_create_pseudo_p ())
{
- /* If we are loading a floating point constant to a register,
- force the value to memory now, since we'll get better code
- out the back end. */
+ if (CONST_DOUBLE_P (op1))
+ {
+ /* If we are loading a floating point constant to a
+ register, force the value to memory now, since we'll
+ get better code out the back end. */
- op1 = validize_mem (force_const_mem (mode, op1));
- if (!register_operand (op0, mode))
+ op1 = validize_mem (force_const_mem (mode, op1));
+ if (!register_operand (op0, mode))
+ {
+ rtx temp = gen_reg_rtx (mode);
+ emit_insn (gen_rtx_SET (temp, op1));
+ emit_move_insn (op0, temp);
+ return;
+ }
+ }
+ else if (GET_MODE_SIZE (mode) >= 16)
{
- rtx temp = gen_reg_rtx (mode);
- emit_insn (gen_rtx_SET (temp, op1));
- emit_move_insn (op0, temp);
- return;
+ rtx tmp = ix86_convert_const_wide_int_to_broadcast
+ (GET_MODE (op0), op1);
+ if (tmp != nullptr)
+ op1 = tmp;
}
}
}
@@ -364,6 +453,70 @@ ix86_expand_move (machine_mode mode, rtx operands[])
emit_insn (gen_rtx_SET (op0, op1));
}
+/* OP is a memref of CONST_VECTOR, return scalar constant mem
+ if CONST_VECTOR is a vec_duplicate, else return NULL. */
+static rtx
+ix86_broadcast_from_constant (machine_mode mode, rtx op)
+{
+ int nunits = GET_MODE_NUNITS (mode);
+ if (nunits < 2)
+ return nullptr;
+
+ /* Don't use integer vector broadcast if we can't move from GPR to SSE
+ register directly. */
+ if (!TARGET_INTER_UNIT_MOVES_TO_VEC
+ && INTEGRAL_MODE_P (mode))
+ return nullptr;
+
+ /* Convert CONST_VECTOR to a non-standard SSE constant integer
+ broadcast only if vector broadcast is available. */
+ if (!(TARGET_AVX2
+ || (TARGET_AVX
+ && (GET_MODE_INNER (mode) == SImode
+ || GET_MODE_INNER (mode) == DImode))
+ || FLOAT_MODE_P (mode))
+ || standard_sse_constant_p (op, mode))
+ return nullptr;
+
+ /* Don't broadcast from a 64-bit integer constant in 32-bit mode.
+ We can still put 64-bit integer constant in memory when
+ avx512 embed broadcast is available. */
+ if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT
+ && (!TARGET_AVX512F
+ || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL)))
+ return nullptr;
+
+ if (GET_MODE_INNER (mode) == TImode)
+ return nullptr;
+
+ rtx constant = get_pool_constant (XEXP (op, 0));
+ if (GET_CODE (constant) != CONST_VECTOR)
+ return nullptr;
+
+ /* There could be some rtx like
+ (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
+ but with "*.LC1" referring to a V2DI constant vector. */
+ if (GET_MODE (constant) != mode)
+ {
+ constant = simplify_subreg (mode, constant, GET_MODE (constant),
+ 0);
+ if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
+ return nullptr;
+ }
+
+ rtx first = XVECEXP (constant, 0, 0);
+
+ for (int i = 1; i < nunits; ++i)
+ {
+ rtx tmp = XVECEXP (constant, 0, i);
+ /* Vector duplicate value. */
+ if (!rtx_equal_p (tmp, first))
+ return nullptr;
+ }
+
+ return first;
+}
+
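ix86_broadcast_from_constant applies the same idea to constant-pool vectors: when every element of the CONST_VECTOR is identical, the caller can broadcast the single scalar element instead of loading the whole vector from memory. Roughly the kind of source this targets (illustration only; whether a broadcast or a full constant load is actually emitted still depends on the checks above and on the enabled ISA flags):

#include <immintrin.h>

/* All lanes equal: instead of keeping a full 32-byte constant in .rodata,
   the value can be built by broadcasting a single 4-byte scalar.  */
__attribute__ ((target ("avx2")))
__m256i
all_lanes_123 (void)
{
  return _mm256_set1_epi32 (123);
}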
void
ix86_expand_vector_move (machine_mode mode, rtx operands[])
{
@@ -403,7 +556,39 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1));
}
else
- op1 = validize_mem (force_const_mem (mode, op1));
+ {
+ machine_mode mode = GET_MODE (op0);
+ rtx tmp = ix86_convert_const_wide_int_to_broadcast
+ (mode, op1);
+ if (tmp == nullptr)
+ op1 = validize_mem (force_const_mem (mode, op1));
+ else
+ op1 = tmp;
+ }
+ }
+
+ if (can_create_pseudo_p ()
+ && GET_MODE_SIZE (mode) >= 16
+ && VECTOR_MODE_P (mode)
+ && (MEM_P (op1)
+ && SYMBOL_REF_P (XEXP (op1, 0))
+ && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0))))
+ {
+ rtx first = ix86_broadcast_from_constant (mode, op1);
+ if (first != nullptr)
+ {
+ /* Broadcast to XMM/YMM/ZMM register from an integer
+ constant or scalar mem. */
+ op1 = gen_reg_rtx (mode);
+ if (FLOAT_MODE_P (mode)
+ || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode))
+ first = force_const_mem (GET_MODE_INNER (mode), first);
+ bool ok = ix86_expand_vector_init_duplicate (false, mode,
+ op1, first);
+ gcc_assert (ok);
+ emit_move_insn (op0, op1);
+ return;
+ }
}
/* We need to check memory alignment for SSE mode since attribute
@@ -419,7 +604,11 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
arguments in memory. */
if (!register_operand (op0, mode)
&& !register_operand (op1, mode))
- op1 = force_reg (mode, op1);
+ {
+ rtx scratch = ix86_gen_scratch_sse_rtx (mode);
+ emit_move_insn (scratch, op1);
+ op1 = scratch;
+ }
tmp[0] = op0; tmp[1] = op1;
ix86_expand_vector_move_misalign (mode, tmp);
@@ -431,7 +620,9 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[])
&& !register_operand (op0, mode)
&& !register_operand (op1, mode))
{
- emit_move_insn (op0, force_reg (GET_MODE (op0), op1));
+ rtx tmp = ix86_gen_scratch_sse_rtx (GET_MODE (op0));
+ emit_move_insn (tmp, op1);
+ emit_move_insn (op0, tmp);
return;
}
@@ -487,6 +678,10 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1)
extract = gen_avx_vextractf128v32qi;
mode = V16QImode;
break;
+ case E_V16HFmode:
+ extract = gen_avx_vextractf128v16hf;
+ mode = V8HFmode;
+ break;
case E_V8SFmode:
extract = gen_avx_vextractf128v8sf;
mode = V4SFmode;
@@ -759,6 +954,7 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p)
switch (mode)
{
+ case E_V4QImode:
case E_V8QImode:
sse_mode = V16QImode;
double_sse_mode = V32QImode;
@@ -775,6 +971,7 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p)
break;
case E_V4HImode:
+ case E_V2HImode:
sse_mode = V8HImode;
double_sse_mode = V16HImode;
mask = gen_rtx_PARALLEL (VOIDmode,
@@ -794,6 +991,15 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p)
GEN_INT (1), GEN_INT (5)));
break;
+ case E_V2SFmode:
+ sse_mode = V4SFmode;
+ double_sse_mode = V8SFmode;
+ mask = gen_rtx_PARALLEL (VOIDmode,
+ gen_rtvec (4,
+ GEN_INT (0), GEN_INT (4),
+ GEN_INT (1), GEN_INT (5)));
+ break;
+
default:
gcc_unreachable ();
}
@@ -808,14 +1014,36 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p)
rtx insn = gen_rtx_SET (dest, op2);
emit_insn (insn);
+ /* Move high bits to low bits. */
if (high_p)
{
- /* Move bits 64:127 to bits 0:63. */
- mask = gen_rtx_PARALLEL (VOIDmode,
- gen_rtvec (4, GEN_INT (2), GEN_INT (3),
- GEN_INT (0), GEN_INT (0)));
- dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
- op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
+ if (sse_mode == V4SFmode)
+ {
+ mask = gen_rtx_PARALLEL (VOIDmode,
+ gen_rtvec (4, GEN_INT (2), GEN_INT (3),
+ GEN_INT (4), GEN_INT (5)));
+ op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest);
+ op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask);
+ }
+ else
+ {
+ int sz = GET_MODE_SIZE (mode);
+
+ if (sz == 4)
+ mask = gen_rtx_PARALLEL (VOIDmode,
+ gen_rtvec (4, GEN_INT (1), GEN_INT (0),
+ GEN_INT (0), GEN_INT (1)));
+ else if (sz == 8)
+ mask = gen_rtx_PARALLEL (VOIDmode,
+ gen_rtvec (4, GEN_INT (2), GEN_INT (3),
+ GEN_INT (0), GEN_INT (1)));
+ else
+ gcc_unreachable ();
+
+ dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest));
+ op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask);
+ }
+
insn = gen_rtx_SET (dest, op1);
emit_insn (insn);
}
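The new V4QI/V2HI/V2SF cases let ix86_split_mmx_punpck do the interleave in the corresponding full-width SSE mode and then select the half it needs. What the low interleave itself computes, shown on plain arrays (illustration only, not the RTL the function builds):

#include <stdio.h>

/* Low "punpck": interleave the low halves of A and B,
   giving { a0, b0, a1, b1 }.  */
int
main (void)
{
  unsigned char a[4] = { 0, 1, 2, 3 }, b[4] = { 10, 11, 12, 13 }, r[4];
  for (int i = 0; i < 2; i++)
    {
      r[2 * i] = a[i];
      r[2 * i + 1] = b[i];
    }
  printf ("%d %d %d %d\n", r[0], r[1], r[2], r[3]);   /* 0 10 1 11 */
  return 0;
}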
@@ -1291,6 +1519,9 @@ find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2)
return false;
}
+/* INSN_UID of the last insn emitted by zero store peephole2s. */
+int ix86_last_zero_store_uid;
+
/* Split lea instructions into a sequence of instructions
which are executed on ALU to avoid AGU stalls.
It is assumed that it is allowed to clobber flags register
@@ -1348,9 +1579,10 @@ ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode)
if (regno0 != regno2)
emit_insn (gen_rtx_SET (target, parts.index));
- /* Use shift for scaling. */
- ix86_emit_binop (ASHIFT, mode, target,
- GEN_INT (exact_log2 (parts.scale)));
+ /* Use shift for scaling, but emit it as MULT instead
+ to avoid it being immediately peephole2 optimized back
+ into lea. */
+ ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale));
if (parts.base)
ix86_emit_binop (PLUS, mode, target, parts.base);
@@ -1549,6 +1781,8 @@ ix86_expand_convert_uns_sixf_sse (rtx, rtx)
gcc_unreachable ();
}
+static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask);
+
/* Convert an unsigned SImode value into a DFmode. Only currently used
for SSE, but applicable anywhere. */
@@ -1568,6 +1802,11 @@ ix86_expand_convert_uns_sidf_sse (rtx target, rtx input)
x = const_double_from_real_value (TWO31r, DFmode);
x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT);
+
+ /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */
+ if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math)
+ x = ix86_expand_sse_fabs (x, NULL);
+
if (x != target)
emit_move_insn (target, x);
}
@@ -1616,12 +1855,21 @@ ix86_expand_convert_uns_sisf_sse (rtx target, rtx input)
fp_lo = gen_reg_rtx (SFmode);
emit_insn (gen_floatsisf2 (fp_hi, int_hi));
emit_insn (gen_floatsisf2 (fp_lo, int_lo));
- fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
- 0, OPTAB_DIRECT);
- fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
- 0, OPTAB_DIRECT);
- if (!rtx_equal_p (target, fp_hi))
- emit_move_insn (target, fp_hi);
+ if (TARGET_FMA)
+ {
+ x = validize_mem (force_const_mem (SFmode, x));
+ fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo);
+ emit_move_insn (target, fp_hi);
+ }
+ else
+ {
+ fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi,
+ 0, OPTAB_DIRECT);
+ fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target,
+ 0, OPTAB_DIRECT);
+ if (!rtx_equal_p (target, fp_hi))
+ emit_move_insn (target, fp_hi);
+ }
}
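The new TARGET_FMA branch folds the final multiply and add of the unsigned-int-to-float sequence into a single fused multiply-add. The underlying arithmetic, sketched with the standard fmaf (plain C illustration, not the RTL the expander emits; the helper name is invented):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* Convert an unsigned 32-bit value to float by converting its 16-bit
   halves exactly and recombining them as hi * 2^16 + lo, the step the
   patch now emits as one FMA when available.  */
static float
u32_to_float (uint32_t x)
{
  float hi = (float) (x >> 16);
  float lo = (float) (x & 0xffff);
  return fmaf (hi, 65536.0f, lo);
}

int
main (void)
{
  printf ("%f\n", u32_to_float (4000000000u));   /* 4000000000.000000 */
  return 0;
}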
/* floatunsv{4,8}siv{4,8}sf2 expander. Expand code to convert
@@ -1653,12 +1901,20 @@ ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val)
real_ldexp (&TWO16r, &dconst1, 16);
tmp[5] = const_double_from_real_value (TWO16r, SFmode);
tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5]));
- tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1,
- OPTAB_DIRECT);
- tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1,
- OPTAB_DIRECT);
- if (tmp[7] != target)
- emit_move_insn (target, tmp[7]);
+ if (TARGET_FMA)
+ {
+ tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]);
+ emit_move_insn (target, tmp[6]);
+ }
+ else
+ {
+ tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5],
+ NULL_RTX, 1, OPTAB_DIRECT);
+ tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6],
+ target, 1, OPTAB_DIRECT);
+ if (tmp[7] != target)
+ emit_move_insn (target, tmp[7]);
+ }
}
/* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc*
@@ -1863,13 +2119,9 @@ void
ix86_expand_copysign (rtx operands[])
{
machine_mode mode, vmode;
- rtx dest, op0, op1, mask;
-
- dest = operands[0];
- op0 = operands[1];
- op1 = operands[2];
+ rtx dest, op0, op1, mask, op2, op3;
- mode = GET_MODE (dest);
+ mode = GET_MODE (operands[0]);
if (mode == SFmode)
vmode = V4SFmode;
@@ -1880,136 +2132,40 @@ ix86_expand_copysign (rtx operands[])
else
gcc_unreachable ();
- mask = ix86_build_signbit_mask (vmode, 0, 0);
-
- if (CONST_DOUBLE_P (op0))
- {
- if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0)))
- op0 = simplify_unary_operation (ABS, mode, op0, mode);
-
- if (mode == SFmode || mode == DFmode)
- {
- if (op0 == CONST0_RTX (mode))
- op0 = CONST0_RTX (vmode);
- else
- {
- rtx v = ix86_build_const_vector (vmode, false, op0);
-
- op0 = force_reg (vmode, v);
- }
- }
- else if (op0 != CONST0_RTX (mode))
- op0 = force_reg (mode, op0);
-
- emit_insn (gen_copysign3_const (mode, dest, op0, op1, mask));
- }
- else
- {
- rtx nmask = ix86_build_signbit_mask (vmode, 0, 1);
-
- emit_insn (gen_copysign3_var
- (mode, dest, NULL_RTX, op0, op1, nmask, mask));
- }
-}
-
-/* Deconstruct a copysign operation into bit masks. Operand 0 is known to
- be a constant, and so has already been expanded into a vector constant. */
-
-void
-ix86_split_copysign_const (rtx operands[])
-{
- machine_mode mode, vmode;
- rtx dest, op0, mask, x;
-
- dest = operands[0];
- op0 = operands[1];
- mask = operands[3];
-
- mode = GET_MODE (dest);
- vmode = GET_MODE (mask);
-
- dest = lowpart_subreg (vmode, dest, mode);
- x = gen_rtx_AND (vmode, dest, mask);
- emit_insn (gen_rtx_SET (dest, x));
-
- if (op0 != CONST0_RTX (vmode))
- {
- x = gen_rtx_IOR (vmode, dest, op0);
- emit_insn (gen_rtx_SET (dest, x));
- }
-}
-
-/* Deconstruct a copysign operation into bit masks. Operand 0 is variable,
- so we have to do two masks. */
-
-void
-ix86_split_copysign_var (rtx operands[])
-{
- machine_mode mode, vmode;
- rtx dest, scratch, op0, op1, mask, nmask, x;
-
- dest = operands[0];
- scratch = operands[1];
- op0 = operands[2];
- op1 = operands[3];
- nmask = operands[4];
- mask = operands[5];
-
- mode = GET_MODE (dest);
- vmode = GET_MODE (mask);
-
- if (rtx_equal_p (op0, op1))
+ if (rtx_equal_p (operands[1], operands[2]))
{
- /* Shouldn't happen often (it's useless, obviously), but when it does
- we'd generate incorrect code if we continue below. */
- emit_move_insn (dest, op0);
+ emit_move_insn (operands[0], operands[1]);
return;
}
- if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */
- {
- gcc_assert (REGNO (op1) == REGNO (scratch));
-
- x = gen_rtx_AND (vmode, scratch, mask);
- emit_insn (gen_rtx_SET (scratch, x));
+ dest = lowpart_subreg (vmode, operands[0], mode);
+ op1 = lowpart_subreg (vmode, operands[2], mode);
+ mask = ix86_build_signbit_mask (vmode, 0, 0);
- dest = mask;
- op0 = lowpart_subreg (vmode, op0, mode);
- x = gen_rtx_NOT (vmode, dest);
- x = gen_rtx_AND (vmode, x, op0);
- emit_insn (gen_rtx_SET (dest, x));
- }
- else
+ if (CONST_DOUBLE_P (operands[1]))
{
- if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */
- {
- x = gen_rtx_AND (vmode, scratch, mask);
- }
- else /* alternative 2,4 */
+ op0 = simplify_unary_operation (ABS, mode, operands[1], mode);
+ /* Optimize for 0: simplify b = copysignf (0.0f, a) to b = mask & a. */
+ if (op0 == CONST0_RTX (mode))
{
- gcc_assert (REGNO (mask) == REGNO (scratch));
- op1 = lowpart_subreg (vmode, op1, mode);
- x = gen_rtx_AND (vmode, scratch, op1);
+ emit_move_insn (dest, gen_rtx_AND (vmode, mask, op1));
+ return;
}
- emit_insn (gen_rtx_SET (scratch, x));
- if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */
- {
- dest = lowpart_subreg (vmode, op0, mode);
- x = gen_rtx_AND (vmode, dest, nmask);
- }
- else /* alternative 3,4 */
- {
- gcc_assert (REGNO (nmask) == REGNO (dest));
- dest = nmask;
- op0 = lowpart_subreg (vmode, op0, mode);
- x = gen_rtx_AND (vmode, dest, op0);
- }
- emit_insn (gen_rtx_SET (dest, x));
+ if (GET_MODE_SIZE (mode) < 16)
+ op0 = ix86_build_const_vector (vmode, false, op0);
+ op0 = force_reg (vmode, op0);
}
-
- x = gen_rtx_IOR (vmode, dest, scratch);
- emit_insn (gen_rtx_SET (dest, x));
+ else
+ op0 = lowpart_subreg (vmode, operands[1], mode);
+
+ op2 = gen_reg_rtx (vmode);
+ op3 = gen_reg_rtx (vmode);
+ emit_move_insn (op2, gen_rtx_AND (vmode,
+ gen_rtx_NOT (vmode, mask),
+ op0));
+ emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1));
+ emit_move_insn (dest, gen_rtx_IOR (vmode, op2, op3));
}
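The rewritten ix86_expand_copysign emits the mask sequence directly: keep the non-sign bits of the first operand, take the sign bit of the second, and OR them together. The same bit manipulation on a scalar float, as a plain C illustration (the expander, of course, builds the equivalent NOT/AND/AND/IOR RTL on vector registers):

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* copysign via (x & ~SIGN) | (y & SIGN) on the bit patterns.  */
static float
copysign_bits (float x, float y)
{
  uint32_t ux, uy, sign = 0x80000000u;
  memcpy (&ux, &x, sizeof ux);
  memcpy (&uy, &y, sizeof uy);
  uint32_t ur = (ux & ~sign) | (uy & sign);
  float r;
  memcpy (&r, &ur, sizeof r);
  return r;
}

int
main (void)
{
  printf ("%f %f\n", copysign_bits (3.5f, -1.0f), copysign_bits (-2.0f, 1.0f));
  return 0;   /* prints -3.500000 2.000000 */
}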
/* Expand an xorsign operation. */
@@ -2018,7 +2174,7 @@ void
ix86_expand_xorsign (rtx operands[])
{
machine_mode mode, vmode;
- rtx dest, op0, op1, mask;
+ rtx dest, op0, op1, mask, x, temp;
dest = operands[0];
op0 = operands[1];
@@ -2033,32 +2189,17 @@ ix86_expand_xorsign (rtx operands[])
else
gcc_unreachable ();
+ temp = gen_reg_rtx (vmode);
mask = ix86_build_signbit_mask (vmode, 0, 0);
- emit_insn (gen_xorsign3_1 (mode, dest, op0, op1, mask));
-}
-
-/* Deconstruct an xorsign operation into bit masks. */
-
-void
-ix86_split_xorsign (rtx operands[])
-{
- machine_mode mode, vmode;
- rtx dest, op0, mask, x;
-
- dest = operands[0];
- op0 = operands[1];
- mask = operands[3];
+ op1 = lowpart_subreg (vmode, op1, mode);
+ x = gen_rtx_AND (vmode, op1, mask);
+ emit_insn (gen_rtx_SET (temp, x));
- mode = GET_MODE (dest);
- vmode = GET_MODE (mask);
+ op0 = lowpart_subreg (vmode, op0, mode);
+ x = gen_rtx_XOR (vmode, temp, op0);
dest = lowpart_subreg (vmode, dest, mode);
- x = gen_rtx_AND (vmode, dest, mask);
- emit_insn (gen_rtx_SET (dest, x));
-
- op0 = lowpart_subreg (vmode, op0, mode);
- x = gen_rtx_XOR (vmode, dest, op0);
emit_insn (gen_rtx_SET (dest, x));
}
@@ -2096,6 +2237,7 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label)
switch (mode)
{
+ case E_HFmode:
case E_SFmode:
case E_DFmode:
case E_XFmode:
@@ -2409,7 +2551,7 @@ ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1)
bool unordered_compare = ix86_unordered_fp_compare (code);
rtx op0 = *pop0, op1 = *pop1;
machine_mode op_mode = GET_MODE (op0);
- bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode);
+ bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode);
/* All of the unordered compare instructions only work on registers.
The same is true of the fcomi compare instructions. The XFmode
@@ -2650,6 +2792,14 @@ ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1)
machine_mode cmpmode;
rtx tmp, flags;
+ /* Swap operands to emit carry flag comparison. */
+ if ((code == GTU || code == LEU)
+ && nonimmediate_operand (op1, VOIDmode))
+ {
+ std::swap (op0, op1);
+ code = swap_condition (code);
+ }
+
cmpmode = SELECT_CC_MODE (code, op0, op1);
flags = gen_rtx_REG (cmpmode, FLAGS_REG);
@@ -3469,6 +3619,37 @@ ix86_valid_mask_cmp_mode (machine_mode mode)
return vector_size == 64 || TARGET_AVX512VL;
}
+/* Return true if integer mask comparison should be used. */
+static bool
+ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode,
+ rtx op_true, rtx op_false)
+{
+ int vector_size = GET_MODE_SIZE (mode);
+
+ if (vector_size < 16)
+ return false;
+ else if (vector_size == 64)
+ return true;
+
+ /* When op_true is NULL, op_false must be NULL, or vice versa. */
+ gcc_assert (!op_true == !op_false);
+
+ /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode,
+ vector dest is required. */
+ if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode))
+ return false;
+
+ /* Exclude those that could be optimized in ix86_expand_sse_movcc. */
+ if (op_false == CONST0_RTX (mode)
+ || op_true == CONST0_RTX (mode)
+ || (INTEGRAL_MODE_P (mode)
+ && (op_true == CONSTM1_RTX (mode)
+ || op_false == CONSTM1_RTX (mode))))
+ return false;
+
+ return true;
+}
+
/* Expand an SSE comparison. Return the register with the result. */
static rtx
@@ -3485,7 +3666,7 @@ ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
bool maskcmp = false;
rtx x;
- if (ix86_valid_mask_cmp_mode (cmp_ops_mode))
+ if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false))
{
unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode);
maskcmp = true;
@@ -3496,7 +3677,7 @@ ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
cmp_op0 = force_reg (cmp_ops_mode, cmp_op0);
- int (*op1_predicate)(rtx, machine_mode)
+ bool (*op1_predicate)(rtx, machine_mode)
= VECTOR_MODE_P (cmp_ops_mode) ? vector_operand : nonimmediate_operand;
if (!op1_predicate (cmp_op1, cmp_ops_mode))
@@ -3517,7 +3698,7 @@ ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1,
x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1);
- if (cmp_mode != mode && !maskcmp)
+ if (cmp_mode != mode)
{
x = force_reg (cmp_ops_mode, x);
convert_move (dest, x, false);
@@ -3544,9 +3725,6 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
return;
}
- /* In AVX512F the result of comparison is an integer mask. */
- bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode);
-
rtx t2, t3, x;
/* If we have an integer mask and FP value then we need
@@ -3557,8 +3735,11 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
cmp = gen_rtx_SUBREG (mode, cmp, 0);
}
- if (maskcmp)
+ /* In AVX512F the result of comparison is an integer mask. */
+ if (mode != cmpmode
+ && GET_MODE_CLASS (cmpmode) == MODE_INT)
{
+ gcc_assert (ix86_valid_mask_cmp_mode (mode));
/* Using vector move with mask register. */
cmp = force_reg (cmpmode, cmp);
/* Optimize for mask zero. */
@@ -3568,17 +3749,11 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
? force_reg (mode, op_false) : op_false);
if (op_true == CONST0_RTX (mode))
{
- rtx (*gen_not) (rtx, rtx);
- switch (cmpmode)
- {
- case E_QImode: gen_not = gen_knotqi; break;
- case E_HImode: gen_not = gen_knothi; break;
- case E_SImode: gen_not = gen_knotsi; break;
- case E_DImode: gen_not = gen_knotdi; break;
- default: gcc_unreachable ();
- }
rtx n = gen_reg_rtx (cmpmode);
- emit_insn (gen_not (n, cmp));
+ if (cmpmode == E_DImode && !TARGET_64BIT)
+ emit_insn (gen_knotdi (n, cmp));
+ else
+ emit_insn (gen_rtx_SET (n, gen_rtx_fmt_e (NOT, cmpmode, cmp)));
cmp = n;
/* Reverse op_true op_false. */
std::swap (op_true, op_false);
@@ -3620,7 +3795,8 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
{
op_true = force_reg (mode, op_true);
- if (!nonimmediate_operand (op_false, mode))
+ if (GET_MODE_SIZE (mode) < 16
+ || !nonimmediate_operand (op_false, mode))
op_false = force_reg (mode, op_false);
emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp,
@@ -3639,6 +3815,13 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
switch (mode)
{
+ case E_V2SFmode:
+ if (TARGET_SSE4_1)
+ {
+ gen = gen_mmx_blendvps;
+ op_true = force_reg (mode, op_true);
+ }
+ break;
case E_V4SFmode:
if (TARGET_SSE4_1)
gen = gen_sse4_1_blendvps;
@@ -3661,8 +3844,38 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
op_true = force_reg (mode, op_true);
}
break;
+ case E_V8QImode:
+ case E_V4HImode:
+ case E_V2SImode:
+ if (TARGET_SSE4_1)
+ {
+ op_true = force_reg (mode, op_true);
+
+ gen = gen_mmx_pblendvb64;
+ if (mode != V8QImode)
+ d = gen_reg_rtx (V8QImode);
+ op_false = gen_lowpart (V8QImode, op_false);
+ op_true = gen_lowpart (V8QImode, op_true);
+ cmp = gen_lowpart (V8QImode, cmp);
+ }
+ break;
+ case E_V4QImode:
+ case E_V2HImode:
+ if (TARGET_SSE4_1)
+ {
+ op_true = force_reg (mode, op_true);
+
+ gen = gen_mmx_pblendvb32;
+ if (mode != V4QImode)
+ d = gen_reg_rtx (V4QImode);
+ op_false = gen_lowpart (V4QImode, op_false);
+ op_true = gen_lowpart (V4QImode, op_true);
+ cmp = gen_lowpart (V4QImode, cmp);
+ }
+ break;
case E_V16QImode:
case E_V8HImode:
+ case E_V8HFmode:
case E_V4SImode:
case E_V2DImode:
if (TARGET_SSE4_1)
@@ -3685,6 +3898,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
break;
case E_V32QImode:
case E_V16HImode:
+ case E_V16HFmode:
case E_V8SImode:
case E_V4DImode:
if (TARGET_AVX2)
@@ -3704,6 +3918,9 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false)
case E_V32HImode:
gen = gen_avx512bw_blendmv32hi;
break;
+ case E_V32HFmode:
+ gen = gen_avx512bw_blendmv32hf;
+ break;
case E_V16SImode:
gen = gen_avx512f_blendmv16si;
break;
@@ -3824,7 +4041,7 @@ ix86_expand_fp_movcc (rtx operands[])
rtx op0 = XEXP (operands[1], 0);
rtx op1 = XEXP (operands[1], 1);
- if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode))
+ if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode))
{
machine_mode cmode;
@@ -4022,7 +4239,7 @@ ix86_expand_fp_vec_cmp (rtx operands[])
}
else
cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3],
- operands[1], operands[2]);
+ NULL, NULL);
if (operands[0] != cmp)
emit_move_insn (operands[0], cmp);
@@ -4042,12 +4259,12 @@ ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
/* XOP supports all of the comparisons on all 128-bit vector int types. */
if (TARGET_XOP
- && (mode == V16QImode || mode == V8HImode
- || mode == V4SImode || mode == V2DImode))
+ && GET_MODE_CLASS (mode) == MODE_VECTOR_INT
+ && GET_MODE_SIZE (mode) <= 16)
;
/* AVX512F supports all of the comparisons
on all 128/256/512-bit vector int types. */
- else if (ix86_valid_mask_cmp_mode (mode))
+ else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false))
;
else
{
@@ -4167,16 +4384,44 @@ ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
else if (code == GT && TARGET_SSE4_1)
gen = gen_sminv16qi3;
break;
+ case E_V8QImode:
+ if (code == GTU && TARGET_SSE2)
+ gen = gen_uminv8qi3;
+ else if (code == GT && TARGET_SSE4_1)
+ gen = gen_sminv8qi3;
+ break;
+ case E_V4QImode:
+ if (code == GTU && TARGET_SSE2)
+ gen = gen_uminv4qi3;
+ else if (code == GT && TARGET_SSE4_1)
+ gen = gen_sminv4qi3;
+ break;
case E_V8HImode:
if (code == GTU && TARGET_SSE4_1)
gen = gen_uminv8hi3;
else if (code == GT && TARGET_SSE2)
gen = gen_sminv8hi3;
break;
+ case E_V4HImode:
+ if (code == GTU && TARGET_SSE4_1)
+ gen = gen_uminv4hi3;
+ else if (code == GT && TARGET_SSE2)
+ gen = gen_sminv4hi3;
+ break;
+ case E_V2HImode:
+ if (code == GTU && TARGET_SSE4_1)
+ gen = gen_uminv2hi3;
+ else if (code == GT && TARGET_SSE2)
+ gen = gen_sminv2hi3;
+ break;
case E_V4SImode:
if (TARGET_SSE4_1)
gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3;
break;
+ case E_V2SImode:
+ if (TARGET_SSE4_1)
+ gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3;
+ break;
case E_V2DImode:
if (TARGET_AVX512VL)
{
@@ -4217,6 +4462,7 @@ ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
case E_V8SImode:
case E_V4DImode:
case E_V4SImode:
+ case E_V2SImode:
case E_V2DImode:
{
rtx t1, t2, mask;
@@ -4241,7 +4487,11 @@ ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1,
case E_V32QImode:
case E_V16HImode:
case E_V16QImode:
+ case E_V8QImode:
+ case E_V4QImode:
case E_V8HImode:
+ case E_V4HImode:
+ case E_V2HImode:
/* Perform a parallel unsigned saturating subtraction. */
x = gen_reg_rtx (mode);
emit_insn (gen_rtx_SET
@@ -4431,6 +4681,18 @@ ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
switch (mode)
{
+ case E_V16QImode:
+ if (TARGET_AVX512VL && TARGET_AVX512VBMI)
+ gen = gen_avx512vl_vpermt2varv16qi3;
+ break;
+ case E_V32QImode:
+ if (TARGET_AVX512VL && TARGET_AVX512VBMI)
+ gen = gen_avx512vl_vpermt2varv32qi3;
+ break;
+ case E_V64QImode:
+ if (TARGET_AVX512VBMI)
+ gen = gen_avx512bw_vpermt2varv64qi3;
+ break;
case E_V8HImode:
if (TARGET_AVX512VL && TARGET_AVX512BW)
gen = gen_avx512vl_vpermt2varv8hi3;
@@ -4439,10 +4701,6 @@ ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1,
if (TARGET_AVX512VL && TARGET_AVX512BW)
gen = gen_avx512vl_vpermt2varv16hi3;
break;
- case E_V64QImode:
- if (TARGET_AVX512VBMI)
- gen = gen_avx512bw_vpermt2varv64qi3;
- break;
case E_V32HImode:
if (TARGET_AVX512BW)
gen = gen_avx512bw_vpermt2varv32hi3;
@@ -5022,6 +5280,24 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
else
unpack = gen_sse4_1_sign_extendv2siv2di2;
break;
+ case E_V8QImode:
+ if (unsigned_p)
+ unpack = gen_sse4_1_zero_extendv4qiv4hi2;
+ else
+ unpack = gen_sse4_1_sign_extendv4qiv4hi2;
+ break;
+ case E_V4HImode:
+ if (unsigned_p)
+ unpack = gen_sse4_1_zero_extendv2hiv2si2;
+ else
+ unpack = gen_sse4_1_sign_extendv2hiv2si2;
+ break;
+ case E_V4QImode:
+ if (unsigned_p)
+ unpack = gen_sse4_1_zero_extendv2qiv2hi2;
+ else
+ unpack = gen_sse4_1_sign_extendv2qiv2hi2;
+ break;
default:
gcc_unreachable ();
}
@@ -5033,10 +5309,30 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
}
else if (high_p)
{
- /* Shift higher 8 bytes to lower 8 bytes. */
- tmp = gen_reg_rtx (V1TImode);
- emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
- GEN_INT (64)));
+ switch (GET_MODE_SIZE (imode))
+ {
+ case 16:
+ /* Shift higher 8 bytes to lower 8 bytes. */
+ tmp = gen_reg_rtx (V1TImode);
+ emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src),
+ GEN_INT (64)));
+ break;
+ case 8:
+ /* Shift higher 4 bytes to lower 4 bytes. */
+ tmp = gen_reg_rtx (V1DImode);
+ emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src),
+ GEN_INT (32)));
+ break;
+ case 4:
+ /* Shift higher 2 bytes to lower 2 bytes. */
+ tmp = gen_reg_rtx (V1SImode);
+ emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src),
+ GEN_INT (16)));
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
tmp = gen_lowpart (imode, tmp);
}
else
@@ -5068,6 +5364,24 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
else
unpack = gen_vec_interleave_lowv4si;
break;
+ case E_V8QImode:
+ if (high_p)
+ unpack = gen_mmx_punpckhbw;
+ else
+ unpack = gen_mmx_punpcklbw;
+ break;
+ case E_V4HImode:
+ if (high_p)
+ unpack = gen_mmx_punpckhwd;
+ else
+ unpack = gen_mmx_punpcklwd;
+ break;
+ case E_V4QImode:
+ if (high_p)
+ unpack = gen_mmx_punpckhbw_low;
+ else
+ unpack = gen_mmx_punpcklbw_low;
+ break;
default:
gcc_unreachable ();
}
@@ -5084,6 +5398,45 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p)
}
}
+/* Return true if mem is pool constant which contains a const_vector
+ perm index, assign the index to PERM. */
+bool
+ix86_extract_perm_from_pool_constant (int* perm, rtx mem)
+{
+ machine_mode mode = GET_MODE (mem);
+ int nelt = GET_MODE_NUNITS (mode);
+
+ if (!INTEGRAL_MODE_P (mode))
+ return false;
+
+ /* Needs to be constant pool. */
+ if (!(MEM_P (mem))
+ || !SYMBOL_REF_P (XEXP (mem, 0))
+ || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0)))
+ return false;
+
+ rtx constant = get_pool_constant (XEXP (mem, 0));
+
+ if (GET_CODE (constant) != CONST_VECTOR)
+ return false;
+
+ /* There could be some rtx like
+ (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1")))
+ but with "*.LC1" referring to a V2DI constant vector. */
+ if (GET_MODE (constant) != mode)
+ {
+ constant = simplify_subreg (mode, constant, GET_MODE (constant), 0);
+
+ if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR)
+ return false;
+ }
+
+ for (int i = 0; i != nelt; i++)
+ perm[i] = UINTVAL (XVECEXP (constant, 0, i));
+
+ return true;
+}
+
/* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode,
but works for floating point parameters and nonoffsettable memories.
For pushes, it returns just stack offsets; the values will be saved
@@ -5954,6 +6307,7 @@ expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem,
/* If possible, it is shorter to use rep movs.
TODO: Maybe it is better to move this logic to decide_alg. */
if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3)
+ && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
&& (!issetmem || orig_value == const0_rtx))
mode = SImode;
@@ -6962,7 +7316,12 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size,
else if (!any_alg_usable_p)
break;
}
- else if (alg_usable_p (candidate, memset, have_as))
+ else if (alg_usable_p (candidate, memset, have_as)
+ && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB
+ && candidate == rep_prefix_1_byte
+ /* NB: If min_size != max_size, size is
+ unknown. */
+ && min_size != max_size))
{
*noalign = algs->size[i].noalign;
return candidate;
@@ -7027,7 +7386,7 @@ decide_alignment (int align,
desired_align = GET_MODE_SIZE (move_mode);
/* PentiumPro has special logic triggering for 8 byte aligned blocks.
copying whole cacheline at once. */
- if (TARGET_PENTIUMPRO
+ if (TARGET_CPU_P (PENTIUMPRO)
&& (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte))
desired_align = 8;
@@ -8027,6 +8386,7 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
pop = NULL;
gcc_assert (!TARGET_64BIT || !pop);
+ rtx addr = XEXP (fnaddr, 0);
if (TARGET_MACHO && !TARGET_64BIT)
{
#if TARGET_MACHO
@@ -8039,7 +8399,6 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
/* Static functions and indirect calls don't need the pic register. Also,
check if PLT was explicitly avoided via no-plt or "noplt" attribute, making
it an indirect call. */
- rtx addr = XEXP (fnaddr, 0);
if (flag_pic
&& GET_CODE (addr) == SYMBOL_REF
&& !SYMBOL_REF_LOCAL_P (addr))
@@ -8202,6 +8561,20 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1,
}
}
+ if (TARGET_MACHO && TARGET_64BIT && !sibcall
+ && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr))
+ || !fndecl || TREE_PUBLIC (fndecl)))
+ {
+ /* We allow public functions defined in a TU to bind locally for PIC
+ code (the default) on 64bit Mach-O.
+ If such functions are not inlined, we cannot tell at compile-time if
+ they will be called via the lazy symbol resolver (this can depend on
+ options given at link-time). Therefore, we must assume that the lazy
+ resolver could be used which clobbers R11 and R10. */
+ clobber_reg (&use, gen_rtx_REG (DImode, R11_REG));
+ clobber_reg (&use, gen_rtx_REG (DImode, R10_REG));
+ }
+
if (vec_len > 1)
call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec));
rtx_insn *call_insn = emit_call_insn (call);
@@ -8634,11 +9007,6 @@ ix86_expand_sse_comi (const struct builtin_description *d, tree exp,
if (VECTOR_MODE_P (mode1))
op1 = safe_vector_operand (op1, mode1);
- /* Swap operands if we have a comparison that isn't available in
- hardware. */
- if (d->flag & BUILTIN_DESC_SWAP_OPERANDS)
- std::swap (op0, op1);
-
target = gen_reg_rtx (SImode);
emit_move_insn (target, const0_rtx);
target = gen_rtx_SUBREG (QImode, target, 0);
@@ -9159,12 +9527,14 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case FLOAT128_FTYPE_FLOAT128_FLOAT128:
case V16QI_FTYPE_V16QI_V16QI:
case V16QI_FTYPE_V8HI_V8HI:
+ case V16HF_FTYPE_V16HF_V16HF:
case V16SF_FTYPE_V16SF_V16SF:
case V8QI_FTYPE_V8QI_V8QI:
case V8QI_FTYPE_V4HI_V4HI:
case V8HI_FTYPE_V8HI_V8HI:
case V8HI_FTYPE_V16QI_V16QI:
case V8HI_FTYPE_V4SI_V4SI:
+ case V8HF_FTYPE_V8HF_V8HF:
case V8SF_FTYPE_V8SF_V8SF:
case V8SF_FTYPE_V8SF_V8SI:
case V8DF_FTYPE_V8DF_V8DF:
@@ -9574,6 +9944,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI:
case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI:
case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI:
+ case V32HF_FTYPE_V32HF_V32HF_V32HF_USI:
case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI:
case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI:
case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI:
@@ -9591,6 +9962,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI:
case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI:
case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI:
+ case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI:
case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI:
case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI:
case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI:
@@ -9598,6 +9970,7 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI:
case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI:
case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI:
+ case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI:
case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI:
case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI:
case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI:
@@ -9626,14 +9999,17 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case UQI_FTYPE_V8SI_V8SI_INT_UQI:
case QI_FTYPE_V4DF_V4DF_INT_UQI:
case QI_FTYPE_V8SF_V8SF_INT_UQI:
+ case UHI_FTYPE_V16HF_V16HF_INT_UHI:
case UQI_FTYPE_V2DI_V2DI_INT_UQI:
case UQI_FTYPE_V4SI_V4SI_INT_UQI:
case UQI_FTYPE_V2DF_V2DF_INT_UQI:
case UQI_FTYPE_V4SF_V4SF_INT_UQI:
+ case UQI_FTYPE_V8HF_V8HF_INT_UQI:
case UDI_FTYPE_V64QI_V64QI_INT_UDI:
case USI_FTYPE_V32QI_V32QI_INT_USI:
case UHI_FTYPE_V16QI_V16QI_INT_UHI:
case USI_FTYPE_V32HI_V32HI_INT_USI:
+ case USI_FTYPE_V32HF_V32HF_INT_USI:
case UHI_FTYPE_V16HI_V16HI_INT_UHI:
case UQI_FTYPE_V8HI_V8HI_INT_UQI:
nargs = 4;
@@ -9917,6 +10293,9 @@ ix86_expand_args_builtin (const struct builtin_description *d,
case CODE_FOR_avx512f_cmpv16sf3_mask:
case CODE_FOR_avx512f_vmcmpv2df3_mask:
case CODE_FOR_avx512f_vmcmpv4sf3_mask:
+ case CODE_FOR_avx512bw_cmpv32hf3_mask:
+ case CODE_FOR_avx512vl_cmpv16hf3_mask:
+ case CODE_FOR_avx512fp16_cmpv8hf3_mask:
error ("the last argument must be a 5-bit immediate");
return const0_rtx;
@@ -10274,6 +10653,8 @@ ix86_expand_round_builtin (const struct builtin_description *d,
case INT_FTYPE_V4SF_INT:
nargs = 2;
break;
+ case V32HF_FTYPE_V32HF_V32HF_INT:
+ case V8HF_FTYPE_V8HF_V8HF_INT:
case V4SF_FTYPE_V4SF_UINT_INT:
case V4SF_FTYPE_V4SF_UINT64_INT:
case V2DF_FTYPE_V2DF_UINT64_INT:
@@ -10314,12 +10695,14 @@ ix86_expand_round_builtin (const struct builtin_description *d,
case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT:
case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT:
case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT:
+ case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT:
case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT:
case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT:
case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT:
case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT:
case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT:
case V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT:
+ case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT:
nargs = 5;
break;
case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT:
@@ -10333,6 +10716,8 @@ ix86_expand_round_builtin (const struct builtin_description *d,
case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT:
case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT:
case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT:
+ case USI_FTYPE_V32HF_V32HF_INT_USI_INT:
+ case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT:
nargs_constant = 3;
nargs = 5;
break;
@@ -10388,6 +10773,8 @@ ix86_expand_round_builtin (const struct builtin_description *d,
case CODE_FOR_avx512f_cmpv16sf3_mask_round:
case CODE_FOR_avx512f_vmcmpv2df3_mask_round:
case CODE_FOR_avx512f_vmcmpv4sf3_mask_round:
+ case CODE_FOR_avx512f_vmcmpv8hf3_mask_round:
+ case CODE_FOR_avx512bw_cmpv32hf3_mask_round:
error ("the immediate argument must be a 5-bit immediate");
return const0_rtx;
default:
@@ -10832,11 +11219,12 @@ ix86_expand_special_args_builtin (const struct builtin_description *d,
op = fixup_modeless_constant (op, mode);
- /* NB: 3-operands load implied it's a mask load,
+ /* NB: 3-operands load implied it's a mask load or v{p}expand*,
and that mask operand should be at the end.
Keep all-ones mask which would be simplified by the expander. */
if (nargs == 3 && i == 2 && klass == load
- && constm1_operand (op, mode))
+ && constm1_operand (op, mode)
+ && insn_p->operand[i].predicate (op, mode))
;
else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode)
op = copy_to_mode_reg (mode, op);
@@ -11407,10 +11795,24 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
if (target == 0)
target = gen_reg_rtx (QImode);
- pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCZmode, FLAGS_REG),
- const0_rtx);
- emit_insn (gen_rtx_SET (target, pat));
+ /* NB: For aesenc/aesdec keylocker insn, ZF will be set when runtime
+ error occurs. Then the output should be cleared for safety. */
+ rtx_code_label *ok_label;
+ rtx tmp;
+ tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
+ pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
+ ok_label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
+ true, ok_label);
+ /* The runtime error seldom occurs, so predict the OK path as the
+ hotspot and lay it out as the fallthrough block. */
+ predict_jump (REG_BR_PROB_BASE * 90 / 100);
+
+ emit_insn (gen_rtx_SET (op1, const0_rtx));
+
+ emit_label (ok_label);
+ emit_insn (gen_rtx_SET (target, pat));
emit_insn (gen_rtx_SET (op0, op1));
return target;
@@ -11465,8 +11867,17 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget,
if (target == 0)
target = gen_reg_rtx (QImode);
- pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCZmode, FLAGS_REG),
- const0_rtx);
+ tmp = gen_rtx_REG (CCZmode, FLAGS_REG);
+ pat = gen_rtx_EQ (QImode, tmp, const0_rtx);
+ ok_label = gen_label_rtx ();
+ emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp),
+ true, ok_label);
+ predict_jump (REG_BR_PROB_BASE * 90 / 100);
+
+ for (i = 0; i < 8; i++)
+ emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx));
+
+ emit_label (ok_label);
emit_insn (gen_rtx_SET (target, pat));
for (i = 0; i < 8; i++)
@@ -13206,6 +13617,14 @@ rdseed_step:
target);
}
+ if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST
+ && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST)
+ {
+ i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST;
+ return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp,
+ target);
+ }
+
if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST
&& fcode <= IX86_BUILTIN__BDESC_ARGS_LAST)
{
@@ -13552,7 +13971,7 @@ static bool expand_vec_perm_1 (struct expand_vec_perm_d *d);
/* A subroutine of ix86_expand_vector_init. Store into TARGET a vector
with all elements equal to VAR. Return true if successful. */
-static bool
+bool
ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
rtx target, rtx val)
{
@@ -13595,7 +14014,21 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
}
goto widen;
+ case E_V2HImode:
+ if (TARGET_SSE2)
+ {
+ rtx x;
+
+ val = gen_lowpart (SImode, val);
+ x = gen_rtx_TRUNCATE (HImode, val);
+ x = gen_rtx_VEC_DUPLICATE (mode, x);
+ emit_insn (gen_rtx_SET (target, x));
+ return true;
+ }
+ return false;
+
case E_V8QImode:
+ case E_V4QImode:
if (!mmx_ok)
return false;
goto widen;
@@ -13652,10 +14085,17 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
wsmode = GET_MODE_INNER (wvmode);
val = convert_modes (wsmode, smode, val, true);
- x = expand_simple_binop (wsmode, ASHIFT, val,
- GEN_INT (GET_MODE_BITSIZE (smode)),
- NULL_RTX, 1, OPTAB_LIB_WIDEN);
- val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN);
+
+ if (smode == QImode && !TARGET_PARTIAL_REG_STALL)
+ emit_insn (gen_insv_1 (wsmode, val, val));
+ else
+ {
+ x = expand_simple_binop (wsmode, ASHIFT, val,
+ GEN_INT (GET_MODE_BITSIZE (smode)),
+ NULL_RTX, 1, OPTAB_LIB_WIDEN);
+ val = expand_simple_binop (wsmode, IOR, val, x, x, 1,
+ OPTAB_LIB_WIDEN);
+ }
x = gen_reg_rtx (wvmode);
ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val);
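When duplicating a scalar into a vector, the widen path doubles the element until it reaches a mode that can be broadcast; the QImode case now prefers a single insert into the high byte, while other sizes keep the shift/IOR pair shown above. The widening ladder on plain integers, for illustration only (the actual expansion widens one step at a time through vector modes):

#include <stdint.h>
#include <stdio.h>

/* Splat one byte across 64 bits by repeated shift-and-OR doubling.  */
static uint64_t
splat_byte (uint8_t b)
{
  uint64_t v = b;
  v |= v << 8;    /* 8 bits  -> 16 bits */
  v |= v << 16;   /* 16 bits -> 32 bits */
  v |= v << 32;   /* 32 bits -> 64 bits */
  return v;
}

int
main (void)
{
  printf ("%016llx\n", (unsigned long long) splat_byte (0x5a));
  return 0;   /* prints 5a5a5a5a5a5a5a5a */
}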
@@ -13698,6 +14138,11 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode,
}
return true;
+ case E_V8HFmode:
+ case E_V16HFmode:
+ case E_V32HFmode:
+ return ix86_vector_duplicate_value (mode, target, val);
+
default:
return false;
}
@@ -13741,6 +14186,9 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
case E_V4HImode:
use_vector_set = TARGET_SSE || TARGET_3DNOW_A;
break;
+ case E_V4QImode:
+ use_vector_set = TARGET_SSE4_1;
+ break;
case E_V32QImode:
case E_V16HImode:
use_vector_set = TARGET_AVX;
@@ -13779,6 +14227,18 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode,
use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0;
gen_vec_set_0 = gen_vec_setv8di_0;
break;
+ case E_V8HFmode:
+ use_vector_set = TARGET_AVX512FP16 && one_var == 0;
+ gen_vec_set_0 = gen_vec_setv8hf_0;
+ break;
+ case E_V16HFmode:
+ use_vector_set = TARGET_AVX512FP16 && one_var == 0;
+ gen_vec_set_0 = gen_vec_setv16hf_0;
+ break;
+ case E_V32HFmode:
+ use_vector_set = TARGET_AVX512FP16 && one_var == 0;
+ gen_vec_set_0 = gen_vec_setv32hf_0;
+ break;
default:
break;
}
@@ -13928,6 +14388,8 @@ ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
if (!TARGET_64BIT)
return false;
/* FALLTHRU */
+ case E_V8HFmode:
+ case E_V16HFmode:
case E_V4DFmode:
case E_V8SFmode:
case E_V8SImode:
@@ -13949,6 +14411,10 @@ ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode,
break;
wmode = V4HImode;
goto widen;
+ case E_V4QImode:
+ if (TARGET_SSE4_1)
+ break;
+ wmode = V2HImode;
widen:
/* There's no way to set one QImode entry easily. Combine
the variable value with its adjacent constant value, and
@@ -14004,6 +14470,9 @@ ix86_expand_vector_init_concat (machine_mode mode,
case 2:
switch (mode)
{
+ case E_V32HFmode:
+ half_mode = V16HFmode;
+ break;
case E_V16SImode:
half_mode = V8SImode;
break;
@@ -14016,6 +14485,9 @@ ix86_expand_vector_init_concat (machine_mode mode,
case E_V8DFmode:
half_mode = V4DFmode;
break;
+ case E_V16HFmode:
+ half_mode = V8HFmode;
+ break;
case E_V8SImode:
half_mode = V4SImode;
break;
@@ -14158,13 +14630,22 @@ ix86_expand_vector_init_interleave (machine_mode mode,
{
machine_mode first_imode, second_imode, third_imode, inner_mode;
int i, j;
- rtx op0, op1;
+ rtx op, op0, op1;
rtx (*gen_load_even) (rtx, rtx, rtx);
rtx (*gen_interleave_first_low) (rtx, rtx, rtx);
rtx (*gen_interleave_second_low) (rtx, rtx, rtx);
switch (mode)
{
+ case E_V8HFmode:
+ gen_load_even = gen_vec_setv8hf;
+ gen_interleave_first_low = gen_vec_interleave_lowv4si;
+ gen_interleave_second_low = gen_vec_interleave_lowv2di;
+ inner_mode = HFmode;
+ first_imode = V4SImode;
+ second_imode = V2DImode;
+ third_imode = VOIDmode;
+ break;
case E_V8HImode:
gen_load_even = gen_vec_setv8hi;
gen_interleave_first_low = gen_vec_interleave_lowv4si;
@@ -14189,9 +14670,19 @@ ix86_expand_vector_init_interleave (machine_mode mode,
for (i = 0; i < n; i++)
{
+ op = ops [i + i];
+ if (inner_mode == HFmode)
+ {
+ /* Convert HFmode to HImode. */
+ op1 = gen_reg_rtx (HImode);
+ op1 = gen_rtx_SUBREG (HImode, force_reg (HFmode, op), 0);
+ op = gen_reg_rtx (HImode);
+ emit_move_insn (op, op1);
+ }
+
/* Extend the odd element to SImode using a paradoxical SUBREG. */
op0 = gen_reg_rtx (SImode);
- emit_move_insn (op0, gen_lowpart (SImode, ops [i + i]));
+ emit_move_insn (op0, gen_lowpart (SImode, op));
/* Insert the SImode value as low element of V4SImode vector. */
op1 = gen_reg_rtx (V4SImode);
@@ -14328,6 +14819,10 @@ ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode,
half_mode = V8HImode;
goto half;
+ case E_V16HFmode:
+ half_mode = V8HFmode;
+ goto half;
+
half:
n = GET_MODE_NUNITS (mode);
for (i = 0; i < n; i++)
@@ -14351,6 +14846,11 @@ half:
half_mode = V16HImode;
goto quarter;
+ case E_V32HFmode:
+ quarter_mode = V8HFmode;
+ half_mode = V16HFmode;
+ goto quarter;
+
quarter:
n = GET_MODE_NUNITS (mode);
for (i = 0; i < n; i++)
@@ -14387,6 +14887,9 @@ quarter:
move from GPR to SSE register directly. */
if (!TARGET_INTER_UNIT_MOVES_TO_VEC)
break;
+ /* FALLTHRU */
+
+ case E_V8HFmode:
n = GET_MODE_NUNITS (mode);
for (i = 0; i < n; i++)
@@ -14396,6 +14899,9 @@ quarter:
case E_V4HImode:
case E_V8QImode:
+
+ case E_V2HImode:
+ case E_V4QImode:
break;
default:
@@ -14404,12 +14910,14 @@ quarter:
{
int i, j, n_elts, n_words, n_elt_per_word;
- machine_mode inner_mode;
+ machine_mode tmp_mode, inner_mode;
rtx words[4], shift;
+ tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode;
+
inner_mode = GET_MODE_INNER (mode);
n_elts = GET_MODE_NUNITS (mode);
- n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD;
+ n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode);
n_elt_per_word = n_elts / n_words;
shift = GEN_INT (GET_MODE_BITSIZE (inner_mode));
@@ -14420,15 +14928,15 @@ quarter:
for (j = 0; j < n_elt_per_word; ++j)
{
rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1);
- elt = convert_modes (word_mode, inner_mode, elt, true);
+ elt = convert_modes (tmp_mode, inner_mode, elt, true);
if (j == 0)
word = elt;
else
{
- word = expand_simple_binop (word_mode, ASHIFT, word, shift,
+ word = expand_simple_binop (tmp_mode, ASHIFT, word, shift,
word, 1, OPTAB_LIB_WIDEN);
- word = expand_simple_binop (word_mode, IOR, word, elt,
+ word = expand_simple_binop (tmp_mode, IOR, word, elt,
word, 1, OPTAB_LIB_WIDEN);
}
}
@@ -14442,14 +14950,14 @@ quarter:
{
rtx tmp = gen_reg_rtx (mode);
emit_clobber (tmp);
- emit_move_insn (gen_lowpart (word_mode, tmp), words[0]);
- emit_move_insn (gen_highpart (word_mode, tmp), words[1]);
+ emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]);
+ emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]);
emit_move_insn (target, tmp);
}
else if (n_words == 4)
{
rtx tmp = gen_reg_rtx (V4SImode);
- gcc_assert (word_mode == SImode);
+ gcc_assert (tmp_mode == SImode);
vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words));
ix86_expand_vector_init_general (false, V4SImode, tmp, vals);
emit_move_insn (target, gen_lowpart (mode, tmp));
@@ -14482,11 +14990,15 @@ ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals)
if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts)
{
rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) };
- if (inner_mode == QImode || inner_mode == HImode)
+ if (inner_mode == QImode
+ || inner_mode == HImode
+ || inner_mode == TImode)
{
unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode);
- mode = mode_for_vector (SImode, n_bits / 4).require ();
- inner_mode = mode_for_vector (SImode, n_bits / 8).require ();
+ scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode;
+ n_bits /= GET_MODE_SIZE (elt_mode);
+ mode = mode_for_vector (elt_mode, n_bits).require ();
+ inner_mode = mode_for_vector (elt_mode, n_bits / 2).require ();
ops[0] = gen_lowpart (inner_mode, ops[0]);
ops[1] = gen_lowpart (inner_mode, ops[1]);
subtarget = gen_reg_rtx (mode);
@@ -14613,6 +15125,9 @@ ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
case E_V8DFmode:
cmp_mode = V8DImode;
break;
+ case E_V2SFmode:
+ cmp_mode = V2SImode;
+ break;
case E_V4SFmode:
cmp_mode = V4SImode;
break;
@@ -14622,6 +15137,16 @@ ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
case E_V16SFmode:
cmp_mode = V16SImode;
break;
+ /* TARGET_AVX512FP16 implies TARGET_AVX512BW. */
+ case E_V8HFmode:
+ cmp_mode = V8HImode;
+ break;
+ case E_V16HFmode:
+ cmp_mode = V16HImode;
+ break;
+ case E_V32HFmode:
+ cmp_mode = V32HImode;
+ break;
default:
gcc_unreachable ();
}
@@ -14634,9 +15159,11 @@ ix86_expand_vector_set_var (rtx target, rtx val, rtx idx)
idxv = gen_reg_rtx (cmp_mode);
idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1);
- ok = ix86_expand_vector_init_duplicate (false, mode, valv, val);
+ ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
+ mode, valv, val);
gcc_assert (ok);
- ok = ix86_expand_vector_init_duplicate (false, cmp_mode, idxv, idx_tmp);
+ ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE,
+ cmp_mode, idxv, idx_tmp);
gcc_assert (ok);
vec[0] = target;
vec[1] = valv;
@@ -14656,23 +15183,25 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
machine_mode half_mode;
bool use_vec_merge = false;
rtx tmp;
- static rtx (*gen_extract[6][2]) (rtx, rtx)
+ static rtx (*gen_extract[7][2]) (rtx, rtx)
= {
{ gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi },
{ gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi },
{ gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si },
{ gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di },
{ gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf },
- { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }
+ { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df },
+ { gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf }
};
- static rtx (*gen_insert[6][2]) (rtx, rtx, rtx)
+ static rtx (*gen_insert[7][2]) (rtx, rtx, rtx)
= {
{ gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi },
{ gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi },
{ gen_vec_set_lo_v8si, gen_vec_set_hi_v8si },
{ gen_vec_set_lo_v4di, gen_vec_set_hi_v4di },
{ gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf },
- { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }
+ { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df },
+ { gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf },
};
int i, j, n;
machine_mode mmode = VOIDmode;
@@ -14839,7 +15368,12 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
}
return;
+ case E_V8HFmode:
+ use_vec_merge = true;
+ break;
+
case E_V8HImode:
+ case E_V2HImode:
use_vec_merge = TARGET_SSE2;
break;
case E_V4HImode:
@@ -14847,6 +15381,7 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
break;
case E_V16QImode:
+ case E_V4QImode:
use_vec_merge = TARGET_SSE4_1;
break;
@@ -14860,6 +15395,12 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt)
n = 16;
goto half;
+ case E_V16HFmode:
+ half_mode = V8HFmode;
+ j = 6;
+ n = 8;
+ goto half;
+
case E_V16HImode:
half_mode = V8HImode;
j = 1;
@@ -14940,6 +15481,13 @@ half:
}
break;
+ case E_V32HFmode:
+ if (TARGET_AVX512BW)
+ {
+ mmode = SImode;
+ gen_blendm = gen_avx512bw_blendmv32hf;
+ }
+ break;
case E_V32HImode:
if (TARGET_AVX512BW)
{
@@ -15146,6 +15694,7 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
break;
case E_V8HImode:
+ case E_V2HImode:
use_vec_extr = TARGET_SSE2;
break;
case E_V4HImode:
@@ -15166,6 +15715,9 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
return;
}
break;
+ case E_V4QImode:
+ use_vec_extr = TARGET_SSE4_1;
+ break;
case E_V8SFmode:
if (TARGET_AVX)
@@ -15307,6 +15859,28 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt)
ix86_expand_vector_extract (false, target, tmp, elt & 3);
return;
+ case E_V32HFmode:
+ tmp = gen_reg_rtx (V16HFmode);
+ if (elt < 16)
+ emit_insn (gen_vec_extract_lo_v32hf (tmp, vec));
+ else
+ emit_insn (gen_vec_extract_hi_v32hf (tmp, vec));
+ ix86_expand_vector_extract (false, target, tmp, elt & 15);
+ return;
+
+ case E_V16HFmode:
+ tmp = gen_reg_rtx (V8HFmode);
+ if (elt < 8)
+ emit_insn (gen_vec_extract_lo_v16hf (tmp, vec));
+ else
+ emit_insn (gen_vec_extract_hi_v16hf (tmp, vec));
+ ix86_expand_vector_extract (false, target, tmp, elt & 7);
+ return;
+
+ case E_V8HFmode:
+ use_vec_extr = true;
+ break;
+
case E_V8QImode:
use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1;
/* ??? Could extract the appropriate HImode element and shift. */
@@ -16902,7 +17476,9 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
;
else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
;
- else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
+ else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
+ || GET_MODE_SIZE (vmode) == 8
+ || GET_MODE_SIZE (vmode) == 4))
;
else
return false;
@@ -16935,6 +17511,7 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
case E_V8SFmode:
case E_V2DFmode:
case E_V4SFmode:
+ case E_V4HImode:
case E_V8HImode:
case E_V8SImode:
case E_V32HImode:
@@ -16951,6 +17528,12 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
vmode = V8HImode;
goto do_subreg;
+ case E_V2SImode:
+ for (i = 0; i < 2; ++i)
+ mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2);
+ vmode = V4HImode;
+ goto do_subreg;
+
case E_V4SImode:
for (i = 0; i < 4; ++i)
mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2);
@@ -16972,7 +17555,11 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm));
vperm = force_reg (vmode, vperm);
- if (GET_MODE_SIZE (vmode) == 16)
+ if (GET_MODE_SIZE (vmode) == 4)
+ emit_insn (gen_mmx_pblendvb32 (target, op0, op1, vperm));
+ else if (GET_MODE_SIZE (vmode) == 8)
+ emit_insn (gen_mmx_pblendvb64 (target, op0, op1, vperm));
+ else if (GET_MODE_SIZE (vmode) == 16)
emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm));
else
emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm));
@@ -16992,6 +17579,26 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d)
op1 = gen_lowpart (vmode, op1);
break;
+ case E_V8QImode:
+ for (i = 0; i < 8; i += 2)
+ if (d->perm[i] + 1 != d->perm[i + 1])
+ goto use_pblendvb;
+
+ for (i = 0; i < 4; ++i)
+ mask |= (d->perm[i * 2] >= 8) << i;
+ vmode = V4HImode;
+ goto do_subreg;
+
+ case E_V4QImode:
+ for (i = 0; i < 4; i += 2)
+ if (d->perm[i] + 1 != d->perm[i + 1])
+ goto use_pblendvb;
+
+ for (i = 0; i < 2; ++i)
+ mask |= (d->perm[i * 2] >= 4) << i;
+ vmode = V2HImode;
+ goto do_subreg;
+
case E_V32QImode:
/* See if bytes move in pairs. If not, vpblendvb must be used. */
for (i = 0; i < 32; i += 2)
@@ -17153,6 +17760,59 @@ expand_vec_perm_vpermil (struct expand_vec_perm_d *d)
return true;
}
+/* For V*[QHS]Imode permutations, check whether the same permutation
+ can be performed in a 2x, 4x or 8x wider inner mode, and rewrite it
+ into ND accordingly.  */
+
+static bool
+canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
+ struct expand_vec_perm_d *nd)
+{
+ int i;
+ machine_mode mode = VOIDmode;
+
+ switch (d->vmode)
+ {
+ case E_V8QImode: mode = V4HImode; break;
+ case E_V16QImode: mode = V8HImode; break;
+ case E_V32QImode: mode = V16HImode; break;
+ case E_V64QImode: mode = V32HImode; break;
+ case E_V4HImode: mode = V2SImode; break;
+ case E_V8HImode: mode = V4SImode; break;
+ case E_V16HImode: mode = V8SImode; break;
+ case E_V32HImode: mode = V16SImode; break;
+ case E_V4SImode: mode = V2DImode; break;
+ case E_V8SImode: mode = V4DImode; break;
+ case E_V16SImode: mode = V8DImode; break;
+ default: return false;
+ }
+ for (i = 0; i < d->nelt; i += 2)
+ if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
+ return false;
+ nd->vmode = mode;
+ nd->nelt = d->nelt / 2;
+ for (i = 0; i < nd->nelt; i++)
+ nd->perm[i] = d->perm[2 * i] / 2;
+ if (GET_MODE_INNER (mode) != DImode)
+ canonicalize_vector_int_perm (nd, nd);
+ if (nd != d)
+ {
+ nd->one_operand_p = d->one_operand_p;
+ nd->testing_p = d->testing_p;
+ if (d->op0 == d->op1)
+ nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
+ else
+ {
+ nd->op0 = gen_lowpart (nd->vmode, d->op0);
+ nd->op1 = gen_lowpart (nd->vmode, d->op1);
+ }
+ if (d->testing_p)
+ nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
+ else
+ nd->target = gen_reg_rtx (nd->vmode);
+ }
+ return true;
+}
+
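/* A minimal self-contained sketch of the widening step performed by
   canonicalize_vector_int_perm above; the function name, parameters and
   return convention here are illustrative only, not GCC interfaces.
   A permutation can be rewritten on elements twice as wide whenever
   each even/odd index pair selects an aligned pair of source elements,
   e.g. V8HI {2,3,0,1,6,7,4,5} becomes V4SI {1,0,3,2}.  */
static int
sketch_widen_perm_once (const unsigned char *perm, int nelt,
			unsigned char *wide_perm)
{
  for (int i = 0; i < nelt; i += 2)
    if ((perm[i] & 1) || perm[i + 1] != perm[i] + 1)
      return 0;		/* Pair is misaligned or not consecutive; cannot widen.  */
  for (int i = 0; i < nelt / 2; i++)
    wide_perm[i] = perm[2 * i] / 2;
  return 1;
}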
/* Return true if permutation D can be performed as VMODE permutation
instead. */
@@ -17190,151 +17850,193 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
unsigned i, nelt, eltsz, mask;
unsigned char perm[64];
machine_mode vmode = V16QImode;
+ struct expand_vec_perm_d nd;
rtx rperm[64], vperm, target, op0, op1;
nelt = d->nelt;
if (!d->one_operand_p)
- {
- if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16)
- {
- if (TARGET_AVX2
- && valid_perm_using_mode_p (V2TImode, d))
- {
- if (d->testing_p)
- return true;
+ switch (GET_MODE_SIZE (d->vmode))
+ {
+ case 4:
+ if (!TARGET_XOP)
+ return false;
+ vmode = V4QImode;
+ break;
- /* Use vperm2i128 insn. The pattern uses
- V4DImode instead of V2TImode. */
- target = d->target;
- if (d->vmode != V4DImode)
- target = gen_reg_rtx (V4DImode);
- op0 = gen_lowpart (V4DImode, d->op0);
- op1 = gen_lowpart (V4DImode, d->op1);
- rperm[0]
- = GEN_INT ((d->perm[0] / (nelt / 2))
- | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
- emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
- if (target != d->target)
- emit_move_insn (d->target, gen_lowpart (d->vmode, target));
- return true;
- }
+ case 8:
+ if (!TARGET_XOP)
return false;
- }
- }
+ vmode = V8QImode;
+ break;
+
+ case 16:
+ if (!TARGET_XOP)
+ return false;
+ break;
+
+ case 32:
+ if (!TARGET_AVX2)
+ return false;
+
+ if (valid_perm_using_mode_p (V2TImode, d))
+ {
+ if (d->testing_p)
+ return true;
+
+ /* Use vperm2i128 insn. The pattern uses
+ V4DImode instead of V2TImode. */
+ target = d->target;
+ if (d->vmode != V4DImode)
+ target = gen_reg_rtx (V4DImode);
+ op0 = gen_lowpart (V4DImode, d->op0);
+ op1 = gen_lowpart (V4DImode, d->op1);
+ rperm[0]
+ = GEN_INT ((d->perm[0] / (nelt / 2))
+ | ((d->perm[nelt / 2] / (nelt / 2)) * 16));
+ emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0]));
+ if (target != d->target)
+ emit_move_insn (d->target, gen_lowpart (d->vmode, target));
+ return true;
+ }
+ /* FALLTHRU */
+
+ default:
+ return false;
+ }
else
- {
- if (GET_MODE_SIZE (d->vmode) == 16)
- {
- if (!TARGET_SSSE3)
- return false;
- }
- else if (GET_MODE_SIZE (d->vmode) == 32)
- {
- if (!TARGET_AVX2)
- return false;
+ switch (GET_MODE_SIZE (d->vmode))
+ {
+ case 4:
+ if (!TARGET_SSSE3)
+ return false;
+ vmode = V4QImode;
+ break;
- /* V4DImode should be already handled through
- expand_vselect by vpermq instruction. */
- gcc_assert (d->vmode != V4DImode);
+ case 8:
+ if (!TARGET_SSSE3)
+ return false;
+ vmode = V8QImode;
+ break;
- vmode = V32QImode;
- if (d->vmode == V8SImode
- || d->vmode == V16HImode
- || d->vmode == V32QImode)
- {
- /* First see if vpermq can be used for
- V8SImode/V16HImode/V32QImode. */
- if (valid_perm_using_mode_p (V4DImode, d))
- {
- for (i = 0; i < 4; i++)
- perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
- if (d->testing_p)
+ case 16:
+ if (!TARGET_SSSE3)
+ return false;
+ break;
+
+ case 32:
+ if (!TARGET_AVX2)
+ return false;
+
+ /* V4DImode should be already handled through
+ expand_vselect by vpermq instruction. */
+ gcc_assert (d->vmode != V4DImode);
+
+ vmode = V32QImode;
+ if (d->vmode == V8SImode
+ || d->vmode == V16HImode
+ || d->vmode == V32QImode)
+ {
+ /* First see if vpermq can be used for
+ V8SImode/V16HImode/V32QImode. */
+ if (valid_perm_using_mode_p (V4DImode, d))
+ {
+ for (i = 0; i < 4; i++)
+ perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3;
+ if (d->testing_p)
+ return true;
+ target = gen_reg_rtx (V4DImode);
+ if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
+ perm, 4, false))
+ {
+ emit_move_insn (d->target,
+ gen_lowpart (d->vmode, target));
return true;
- target = gen_reg_rtx (V4DImode);
- if (expand_vselect (target, gen_lowpart (V4DImode, d->op0),
- perm, 4, false))
- {
- emit_move_insn (d->target,
- gen_lowpart (d->vmode, target));
- return true;
- }
- return false;
- }
+ }
+ return false;
+ }
- /* Next see if vpermd can be used. */
- if (valid_perm_using_mode_p (V8SImode, d))
- vmode = V8SImode;
- }
- /* Or if vpermps can be used. */
- else if (d->vmode == V8SFmode)
- vmode = V8SImode;
+ /* Next see if vpermd can be used. */
+ if (valid_perm_using_mode_p (V8SImode, d))
+ vmode = V8SImode;
+ }
+ /* Or if vpermps can be used. */
+ else if (d->vmode == V8SFmode)
+ vmode = V8SImode;
- if (vmode == V32QImode)
- {
- /* vpshufb only works intra lanes, it is not
- possible to shuffle bytes in between the lanes. */
- for (i = 0; i < nelt; ++i)
- if ((d->perm[i] ^ i) & (nelt / 2))
- return false;
- }
- }
- else if (GET_MODE_SIZE (d->vmode) == 64)
- {
- if (!TARGET_AVX512BW)
- return false;
+ if (vmode == V32QImode)
+ {
+ /* vpshufb only works intra lanes, it is not
+ possible to shuffle bytes in between the lanes. */
+ for (i = 0; i < nelt; ++i)
+ if ((d->perm[i] ^ i) & (nelt / 2))
+ return false;
+ }
+ break;
- /* If vpermq didn't work, vpshufb won't work either. */
- if (d->vmode == V8DFmode || d->vmode == V8DImode)
- return false;
+ case 64:
+ if (!TARGET_AVX512BW)
+ return false;
- vmode = V64QImode;
- if (d->vmode == V16SImode
- || d->vmode == V32HImode
- || d->vmode == V64QImode)
- {
- /* First see if vpermq can be used for
- V16SImode/V32HImode/V64QImode. */
- if (valid_perm_using_mode_p (V8DImode, d))
- {
- for (i = 0; i < 8; i++)
- perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
- if (d->testing_p)
+ /* If vpermq didn't work, vpshufb won't work either. */
+ if (d->vmode == V8DFmode || d->vmode == V8DImode)
+ return false;
+
+ vmode = V64QImode;
+ if (d->vmode == V16SImode
+ || d->vmode == V32HImode
+ || d->vmode == V64QImode)
+ {
+ /* First see if vpermq can be used for
+ V16SImode/V32HImode/V64QImode. */
+ if (valid_perm_using_mode_p (V8DImode, d))
+ {
+ for (i = 0; i < 8; i++)
+ perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7;
+ if (d->testing_p)
+ return true;
+ target = gen_reg_rtx (V8DImode);
+ if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
+ perm, 8, false))
+ {
+ emit_move_insn (d->target,
+ gen_lowpart (d->vmode, target));
return true;
- target = gen_reg_rtx (V8DImode);
- if (expand_vselect (target, gen_lowpart (V8DImode, d->op0),
- perm, 8, false))
- {
- emit_move_insn (d->target,
- gen_lowpart (d->vmode, target));
- return true;
- }
- return false;
- }
+ }
+ return false;
+ }
- /* Next see if vpermd can be used. */
- if (valid_perm_using_mode_p (V16SImode, d))
- vmode = V16SImode;
- }
- /* Or if vpermps can be used. */
- else if (d->vmode == V16SFmode)
- vmode = V16SImode;
- if (vmode == V64QImode)
- {
- /* vpshufb only works intra lanes, it is not
- possible to shuffle bytes in between the lanes. */
- for (i = 0; i < nelt; ++i)
- if ((d->perm[i] ^ i) & (3 * nelt / 4))
- return false;
- }
- }
- else
+ /* Next see if vpermd can be used. */
+ if (valid_perm_using_mode_p (V16SImode, d))
+ vmode = V16SImode;
+ }
+ /* Or if vpermps can be used. */
+ else if (d->vmode == V16SFmode)
+ vmode = V16SImode;
+ if (vmode == V64QImode)
+ {
+ /* vpshufb only works intra lanes, it is not
+ possible to shuffle bytes in between the lanes. */
+ for (i = 0; i < nelt; ++i)
+ if ((d->perm[i] ^ i) & (3 * nelt / 4))
+ return false;
+ }
+ break;
+
+ default:
return false;
- }
+ }
if (d->testing_p)
return true;
+ /* Try to avoid a variable permutation instruction. */
+ if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd))
+ {
+ emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target));
+ return true;
+ }
+
if (vmode == V8SImode)
for (i = 0; i < 8; ++i)
rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7);
@@ -17346,12 +18048,12 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
eltsz = GET_MODE_UNIT_SIZE (d->vmode);
if (!d->one_operand_p)
mask = 2 * nelt - 1;
- else if (vmode == V16QImode)
- mask = nelt - 1;
else if (vmode == V64QImode)
mask = nelt / 4 - 1;
- else
+ else if (vmode == V32QImode)
mask = nelt / 2 - 1;
+ else
+ mask = nelt - 1;
for (i = 0; i < nelt; ++i)
{
@@ -17361,95 +18063,98 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d)
}
}
- vperm = gen_rtx_CONST_VECTOR (vmode,
- gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm));
- vperm = force_reg (vmode, vperm);
+ machine_mode vpmode = vmode;
- target = d->target;
- if (d->vmode != vmode)
+ if (vmode == V4QImode
+ || vmode == V8QImode)
+ {
+ rtx m128 = GEN_INT (-128);
+
+ /* Remap elements from the second operand, as we have to
+ account for inactive top elements from the first operand. */
+ if (!d->one_operand_p)
+ {
+ int sz = GET_MODE_SIZE (vmode);
+
+ for (i = 0; i < nelt; ++i)
+ {
+ int ival = INTVAL (rperm[i]);
+ if (ival >= sz)
+ ival += 16 - sz;
+ rperm[i] = GEN_INT (ival);
+ }
+ }
+
+ /* V4QI/V8QI is emulated with a V16QI instruction; fill the
+ inactive selector elements in the top positions with -128 so
+ the corresponding result bytes are zeroed. */
+ for (i = nelt; i < 16; ++i)
+ rperm[i] = m128;
+
+ vpmode = V16QImode;
+ }
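/* Worked example of the remapping above for a two-operand V8QImode
   permutation (sz == 8): an index selecting element 3 of the second
   operand (ival == 11) becomes 11 + (16 - 8) == 19, pointing past the
   unused top half of the first 16-byte register, and selector bytes
   8..15 are set to -128 so the result bytes that the V8QI value never
   uses are cleared.  */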
+
+ vperm = gen_rtx_CONST_VECTOR (vpmode,
+ gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm));
+ vperm = force_reg (vpmode, vperm);
+
+ if (vmode == d->vmode)
+ target = d->target;
+ else
target = gen_reg_rtx (vmode);
+
op0 = gen_lowpart (vmode, d->op0);
+
if (d->one_operand_p)
{
- if (vmode == V16QImode)
- emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm));
+ rtx (*gen) (rtx, rtx, rtx);
+
+ if (vmode == V4QImode)
+ gen = gen_mmx_pshufbv4qi3;
+ else if (vmode == V8QImode)
+ gen = gen_mmx_pshufbv8qi3;
+ else if (vmode == V16QImode)
+ gen = gen_ssse3_pshufbv16qi3;
else if (vmode == V32QImode)
- emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm));
+ gen = gen_avx2_pshufbv32qi3;
else if (vmode == V64QImode)
- emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm));
+ gen = gen_avx512bw_pshufbv64qi3;
else if (vmode == V8SFmode)
- emit_insn (gen_avx2_permvarv8sf (target, op0, vperm));
+ gen = gen_avx2_permvarv8sf;
else if (vmode == V8SImode)
- emit_insn (gen_avx2_permvarv8si (target, op0, vperm));
+ gen = gen_avx2_permvarv8si;
else if (vmode == V16SFmode)
- emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm));
+ gen = gen_avx512f_permvarv16sf;
else if (vmode == V16SImode)
- emit_insn (gen_avx512f_permvarv16si (target, op0, vperm));
+ gen = gen_avx512f_permvarv16si;
else
gcc_unreachable ();
+
+ emit_insn (gen (target, op0, vperm));
}
else
{
+ rtx (*gen) (rtx, rtx, rtx, rtx);
+
op1 = gen_lowpart (vmode, d->op1);
- emit_insn (gen_xop_pperm (target, op0, op1, vperm));
+
+ if (vmode == V4QImode)
+ gen = gen_mmx_ppermv32;
+ else if (vmode == V8QImode)
+ gen = gen_mmx_ppermv64;
+ else if (vmode == V16QImode)
+ gen = gen_xop_pperm;
+ else
+ gcc_unreachable ();
+
+ emit_insn (gen (target, op0, op1, vperm));
}
+
if (target != d->target)
emit_move_insn (d->target, gen_lowpart (d->vmode, target));
return true;
}
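/* Illustrative trace of the emulated narrow one-operand path above:
   reversing a V4QImode vector (perm {3,2,1,0}) keeps vmode == V4QImode
   with mask == nelt - 1 == 3, builds the V16QImode selector
   {3,2,1,0, -128 x 12}, and emits gen_mmx_pshufbv4qi3, so only the low
   four bytes of the 128-bit shuffle are meaningful.  */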
-/* For V*[QHS]Imode permutations, check if the same permutation
- can't be performed in a 2x, 4x or 8x wider inner mode. */
-
-static bool
-canonicalize_vector_int_perm (const struct expand_vec_perm_d *d,
- struct expand_vec_perm_d *nd)
-{
- int i;
- machine_mode mode = VOIDmode;
-
- switch (d->vmode)
- {
- case E_V16QImode: mode = V8HImode; break;
- case E_V32QImode: mode = V16HImode; break;
- case E_V64QImode: mode = V32HImode; break;
- case E_V8HImode: mode = V4SImode; break;
- case E_V16HImode: mode = V8SImode; break;
- case E_V32HImode: mode = V16SImode; break;
- case E_V4SImode: mode = V2DImode; break;
- case E_V8SImode: mode = V4DImode; break;
- case E_V16SImode: mode = V8DImode; break;
- default: return false;
- }
- for (i = 0; i < d->nelt; i += 2)
- if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1)
- return false;
- nd->vmode = mode;
- nd->nelt = d->nelt / 2;
- for (i = 0; i < nd->nelt; i++)
- nd->perm[i] = d->perm[2 * i] / 2;
- if (GET_MODE_INNER (mode) != DImode)
- canonicalize_vector_int_perm (nd, nd);
- if (nd != d)
- {
- nd->one_operand_p = d->one_operand_p;
- nd->testing_p = d->testing_p;
- if (d->op0 == d->op1)
- nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0);
- else
- {
- nd->op0 = gen_lowpart (nd->vmode, d->op0);
- nd->op1 = gen_lowpart (nd->vmode, d->op1);
- }
- if (d->testing_p)
- nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1);
- else
- nd->target = gen_reg_rtx (nd->vmode);
- }
- return true;
-}
-
/* Try to expand one-operand permutation with constant mask. */
static bool
@@ -17457,6 +18162,7 @@ ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
{
machine_mode mode = GET_MODE (d->op0);
machine_mode maskmode = mode;
+ unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode));
rtx (*gen) (rtx, rtx, rtx) = NULL;
rtx target, op0, mask;
rtx vec[64];
@@ -17467,6 +18173,18 @@ ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
if (!TARGET_AVX512F)
return false;
+ /* Accept VNxHImode and VNxQImode now. */
+ if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64)
+ return false;
+
+ /* vpermw. */
+ if (!TARGET_AVX512BW && inner_size == 2)
+ return false;
+
+ /* vpermb. */
+ if (!TARGET_AVX512VBMI && inner_size == 1)
+ return false;
+
switch (mode)
{
case E_V16SImode:
@@ -17483,10 +18201,32 @@ ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d)
gen = gen_avx512f_permvarv8df;
maskmode = V8DImode;
break;
+ case E_V32HImode:
+ gen = gen_avx512bw_permvarv32hi;
+ break;
+ case E_V16HImode:
+ gen = gen_avx512vl_permvarv16hi;
+ break;
+ case E_V8HImode:
+ gen = gen_avx512vl_permvarv8hi;
+ break;
+ case E_V64QImode:
+ gen = gen_avx512bw_permvarv64qi;
+ break;
+ case E_V32QImode:
+ gen = gen_avx512vl_permvarv32qi;
+ break;
+ case E_V16QImode:
+ gen = gen_avx512vl_permvarv16qi;
+ break;
+
default:
return false;
}
+ if (d->testing_p)
+ return true;
+
target = d->target;
op0 = d->op0;
for (int i = 0; i < d->nelt; ++i)
@@ -17669,7 +18409,7 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_palignr (d, true))
return true;
- /* Try the AVX512F vperm{s,d} instructions. */
+ /* Try the AVX512F vperm{w,b,s,d} instructions. */
if (ix86_expand_vec_one_operand_perm_avx512 (d))
return true;
@@ -17881,7 +18621,9 @@ expand_vec_perm_pblendv (struct expand_vec_perm_d *d)
;
else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
;
- else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16)
+ else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 4
+ || GET_MODE_SIZE (vmode) == 8
+ || GET_MODE_SIZE (vmode) == 16))
;
else
return false;
@@ -17960,7 +18702,9 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
rtx_insn *seq;
bool ok, same_halves = false;
- if (GET_MODE_SIZE (d->vmode) == 16)
+ if (GET_MODE_SIZE (d->vmode) == 4
+ || GET_MODE_SIZE (d->vmode) == 8
+ || GET_MODE_SIZE (d->vmode) == 16)
{
if (d->one_operand_p)
return false;
@@ -17995,7 +18739,45 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d)
memset (remap, 0xff, sizeof (remap));
dremap = *d;
- if (GET_MODE_SIZE (d->vmode) == 16)
+ if (GET_MODE_SIZE (d->vmode) == 4
+ || GET_MODE_SIZE (d->vmode) == 8)
+ {
+ unsigned HOST_WIDE_INT h1, h2, h3, h4;
+
+ /* Split the two input vectors into 4 halves. */
+ h1 = (HOST_WIDE_INT_1U << nelt2) - 1;
+ h2 = h1 << nelt2;
+ h3 = h2 << nelt2;
+ h4 = h3 << nelt2;
+
+ /* If all selected elements come from the low halves of the two
+ inputs, use interleave low; if they all come from the high
+ halves, use interleave high. */
+ if ((contents & (h1 | h3)) == contents)
+ {
+ /* punpckl* */
+ for (i = 0; i < nelt2; ++i)
+ {
+ remap[i] = i * 2;
+ remap[i + nelt] = i * 2 + 1;
+ dremap.perm[i * 2] = i;
+ dremap.perm[i * 2 + 1] = i + nelt;
+ }
+ }
+ else if ((contents & (h2 | h4)) == contents)
+ {
+ /* punpckh* */
+ for (i = 0; i < nelt2; ++i)
+ {
+ remap[i + nelt2] = i * 2;
+ remap[i + nelt + nelt2] = i * 2 + 1;
+ dremap.perm[i * 2] = i + nelt2;
+ dremap.perm[i * 2 + 1] = i + nelt + nelt2;
+ }
+ }
+ else
+ return false;
+ }
+ else if (GET_MODE_SIZE (d->vmode) == 16)
{
unsigned HOST_WIDE_INT h1, h2, h3, h4;
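/* Concrete values for the size-4/8 test above, taking V8QImode
   (nelt == 8, nelt2 == 4): h1 == 0x000f, h2 == 0x00f0, h3 == 0x0f00,
   h4 == 0xf000.  A permutation whose contents mask fits in h1 | h3
   only touches the low halves of both inputs and maps onto punpcklbw
   with dremap.perm == {0,8,1,9,2,10,3,11}; one that fits in h2 | h4
   maps onto punpckhbw the same way.  */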
@@ -18566,6 +19348,244 @@ expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d)
return true;
}
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement
+ a two vector permutation using two single vector permutations and
+ {,v}{,p}unpckl{ps,pd,bw,wd,dq}. If two_insn, succeed only if one
+ of dfirst or dsecond is an identity permutation. */
+
+static bool
+expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn)
+{
+ unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt;
+ struct expand_vec_perm_d dfirst, dsecond, dfinal;
+ bool ident1 = true, ident2 = true;
+
+ if (d->one_operand_p)
+ return false;
+
+ if (GET_MODE_SIZE (d->vmode) == 16)
+ {
+ if (!TARGET_SSE)
+ return false;
+ if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2)
+ return false;
+ }
+ else if (GET_MODE_SIZE (d->vmode) == 32)
+ {
+ if (!TARGET_AVX)
+ return false;
+ if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2)
+ return false;
+ lane = nelt2;
+ }
+ else
+ return false;
+
+ for (i = 1; i < nelt; i++)
+ if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1)))
+ return false;
+
+ dfirst = *d;
+ dsecond = *d;
+ dfinal = *d;
+ dfirst.op1 = dfirst.op0;
+ dfirst.one_operand_p = true;
+ dsecond.op0 = dsecond.op1;
+ dsecond.one_operand_p = true;
+
+ for (i = 0; i < nelt; i++)
+ if (d->perm[i] >= nelt)
+ {
+ dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt;
+ if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0))
+ ident2 = false;
+ dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)]
+ = d->perm[i] - nelt;
+ }
+ else
+ {
+ dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i];
+ if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0))
+ ident1 = false;
+ dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i];
+ }
+
+ if (two_insn && !ident1 && !ident2)
+ return false;
+
+ if (!d->testing_p)
+ {
+ if (!ident1)
+ dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
+ if (!ident2)
+ dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
+ if (d->perm[0] >= nelt)
+ std::swap (dfinal.op0, dfinal.op1);
+ }
+
+ bool ok;
+ rtx_insn *seq1 = NULL, *seq2 = NULL;
+
+ if (!ident1)
+ {
+ start_sequence ();
+ ok = expand_vec_perm_1 (&dfirst);
+ seq1 = get_insns ();
+ end_sequence ();
+
+ if (!ok)
+ return false;
+ }
+
+ if (!ident2)
+ {
+ start_sequence ();
+ ok = expand_vec_perm_1 (&dsecond);
+ seq2 = get_insns ();
+ end_sequence ();
+
+ if (!ok)
+ return false;
+ }
+
+ if (d->testing_p)
+ return true;
+
+ for (i = 0; i < nelt; i++)
+ {
+ dfinal.perm[i] = i / 2;
+ if (i >= lane)
+ dfinal.perm[i] += lane / 2;
+ if ((i & 1) != 0)
+ dfinal.perm[i] += nelt;
+ }
+ emit_insn (seq1);
+ emit_insn (seq2);
+ ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1,
+ dfinal.perm, dfinal.nelt, false);
+ gcc_assert (ok);
+ return true;
+}
+
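/* Worked example for expand_vec_perm_2perm_interleave: a V4SFmode
   permutation {0,6,3,5} alternates between the two inputs, so it is
   split into the single-operand shuffles dfirst.perm == {0,3,0,3} on
   op0 and dsecond.perm == {2,1,2,1} on op1, and the final
   expand_vselect_vconcat with perm {0,4,1,5} is an interleave low
   (unpcklps) of the two intermediate results.  */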
+/* A subroutine of ix86_expand_vec_perm_const_1. Try to simplify
+ the permutation using two single vector permutations and the SSE4_1 pblendv
+ instruction. If two_insn, succeed only if one of dfirst or dsecond is
+ an identity permutation. */
+
+static bool
+expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn)
+{
+ unsigned i, nelt = d->nelt;
+ struct expand_vec_perm_d dfirst, dsecond, dfinal;
+ machine_mode vmode = d->vmode;
+ bool ident1 = true, ident2 = true;
+
+ /* Use the same checks as in expand_vec_perm_blend. */
+ if (d->one_operand_p)
+ return false;
+ if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32)
+ ;
+ else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode))
+ ;
+ else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16
+ || GET_MODE_SIZE (vmode) == 8
+ || GET_MODE_SIZE (vmode) == 4))
+ ;
+ else
+ return false;
+
+ dfirst = *d;
+ dsecond = *d;
+ dfinal = *d;
+ dfirst.op1 = dfirst.op0;
+ dfirst.one_operand_p = true;
+ dsecond.op0 = dsecond.op1;
+ dsecond.one_operand_p = true;
+
+ for (i = 0; i < nelt; ++i)
+ if (d->perm[i] >= nelt)
+ {
+ dfirst.perm[i] = 0xff;
+ dsecond.perm[i] = d->perm[i] - nelt;
+ if (d->perm[i] != i + nelt)
+ ident2 = false;
+ }
+ else
+ {
+ dsecond.perm[i] = 0xff;
+ dfirst.perm[i] = d->perm[i];
+ if (d->perm[i] != i)
+ ident1 = false;
+ }
+
+ if (two_insn && !ident1 && !ident2)
+ return false;
+
+ /* For now, replace the 0xff entries with concrete indices; ideally
+ 0xff would be treated as a wildcard. */
+ for (i = 0; i < nelt; ++i)
+ if (dfirst.perm[i] == 0xff)
+ {
+ if (GET_MODE_SIZE (vmode) == 32
+ && dfirst.perm[i ^ (nelt / 2)] != 0xff)
+ dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2);
+ else
+ dfirst.perm[i] = i;
+ }
+ else
+ {
+ if (GET_MODE_SIZE (vmode) == 32
+ && dsecond.perm[i ^ (nelt / 2)] != 0xff)
+ dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2);
+ else
+ dsecond.perm[i] = i;
+ }
+
+ if (!d->testing_p)
+ {
+ if (!ident1)
+ dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode);
+ if (!ident2)
+ dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode);
+ }
+
+ bool ok;
+ rtx_insn *seq1 = NULL, *seq2 = NULL;
+
+ if (!ident1)
+ {
+ start_sequence ();
+ ok = expand_vec_perm_1 (&dfirst);
+ seq1 = get_insns ();
+ end_sequence ();
+
+ if (!ok)
+ return false;
+ }
+
+ if (!ident2)
+ {
+ start_sequence ();
+ ok = expand_vec_perm_1 (&dsecond);
+ seq2 = get_insns ();
+ end_sequence ();
+
+ if (!ok)
+ return false;
+ }
+
+ if (d->testing_p)
+ return true;
+
+ for (i = 0; i < nelt; ++i)
+ dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i);
+
+ emit_insn (seq1);
+ emit_insn (seq2);
+ ok = expand_vec_perm_blend (&dfinal);
+ gcc_assert (ok);
+ return true;
+}
+
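/* Worked example for expand_vec_perm_2perm_pblendv: a V4SImode
   permutation {5,0,7,2} becomes dfirst.perm == {0,0,2,2} on op0 and
   dsecond.perm == {1,1,3,3} on op1 after the 0xff wildcards are filled
   in, and dfinal.perm == {4,1,6,3} then blends the two shuffled
   vectors, taking elements 0 and 2 from the shuffled op1.  */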
/* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF
permutation using two vperm2f128, followed by a vshufpd insn blending
the two vectors together. */
@@ -18726,14 +19746,36 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
{
rtx rperm[2][16], vperm, l, h, op, m128;
unsigned int i, nelt, eltsz;
+ machine_mode mode;
+ rtx (*gen) (rtx, rtx, rtx);
- if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16)
+ if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16
+ && GET_MODE_SIZE (d->vmode) != 8
+ && GET_MODE_SIZE (d->vmode) != 4))
return false;
gcc_assert (!d->one_operand_p);
if (d->testing_p)
return true;
+ switch (GET_MODE_SIZE (d->vmode))
+ {
+ case 4:
+ mode = V4QImode;
+ gen = gen_mmx_pshufbv4qi3;
+ break;
+ case 8:
+ mode = V8QImode;
+ gen = gen_mmx_pshufbv8qi3;
+ break;
+ case 16:
+ mode = V16QImode;
+ gen = gen_ssse3_pshufbv16qi3;
+ break;
+ default:
+ gcc_unreachable ();
+ }
+
nelt = d->nelt;
eltsz = GET_MODE_UNIT_SIZE (d->vmode);
@@ -18744,7 +19786,7 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
m128 = GEN_INT (-128);
for (i = 0; i < nelt; ++i)
{
- unsigned j, e = d->perm[i];
+ unsigned j, k, e = d->perm[i];
unsigned which = (e >= nelt);
if (e >= nelt)
e -= nelt;
@@ -18754,26 +19796,29 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d)
rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j);
rperm[1-which][i*eltsz + j] = m128;
}
+
+ for (k = i*eltsz + j; k < 16; ++k)
+ rperm[0][k] = rperm[1][k] = m128;
}
vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0]));
vperm = force_reg (V16QImode, vperm);
- l = gen_reg_rtx (V16QImode);
- op = gen_lowpart (V16QImode, d->op0);
- emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm));
+ l = gen_reg_rtx (mode);
+ op = gen_lowpart (mode, d->op0);
+ emit_insn (gen (l, op, vperm));
vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1]));
vperm = force_reg (V16QImode, vperm);
- h = gen_reg_rtx (V16QImode);
- op = gen_lowpart (V16QImode, d->op1);
- emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm));
+ h = gen_reg_rtx (mode);
+ op = gen_lowpart (mode, d->op1);
+ emit_insn (gen (h, op, vperm));
op = d->target;
- if (d->vmode != V16QImode)
- op = gen_reg_rtx (V16QImode);
- emit_insn (gen_iorv16qi3 (op, l, h));
+ if (d->vmode != mode)
+ op = gen_reg_rtx (mode);
+ emit_insn (gen_rtx_SET (op, gen_rtx_IOR (mode, l, h)));
if (op != d->target)
emit_move_insn (d->target, gen_lowpart (d->vmode, op));
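/* Worked example of the emulated two-operand path above: extracting
   the even bytes of two V8QImode vectors (perm {0,2,4,6,8,10,12,14})
   uses the V16QImode selectors {0,2,4,6, -128 x 12} for op0 and
   {-128 x 4, 0,2,4,6, -128 x 8} for op1; each pshufb leaves zeros in
   the other half, so the final IOR combines them into the low eight
   result bytes.  */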
@@ -18932,9 +19977,9 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d)
}
/* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even
- and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands
- with two "and" and "pack" or two "shift" and "pack" insns. We should
- have already failed all two instruction sequences. */
+ and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI
+ operands with two "and" and "pack" or two "shift" and "pack" insns.
+ We should have already failed all two instruction sequences. */
static bool
expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
@@ -18952,6 +19997,17 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
switch (d->vmode)
{
+ case E_V4HImode:
+ /* Required for "pack". */
+ if (!TARGET_SSE4_1)
+ return false;
+ c = 0xffff;
+ s = 16;
+ half_mode = V2SImode;
+ gen_and = gen_andv2si3;
+ gen_pack = gen_mmx_packusdw;
+ gen_shift = gen_lshrv2si3;
+ break;
case E_V8HImode:
/* Required for "pack". */
if (!TARGET_SSE4_1)
@@ -18963,6 +20019,15 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
gen_pack = gen_sse4_1_packusdw;
gen_shift = gen_lshrv4si3;
break;
+ case E_V8QImode:
+ /* No check as all instructions are SSE2. */
+ c = 0xff;
+ s = 8;
+ half_mode = V4HImode;
+ gen_and = gen_andv4hi3;
+ gen_pack = gen_mmx_packuswb;
+ gen_shift = gen_lshrv4hi3;
+ break;
case E_V16QImode:
/* No check as all instructions are SSE2. */
c = 0xff;
@@ -18995,8 +20060,8 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d)
end_perm = true;
break;
default:
- /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than
- general shuffles. */
+ /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes
+ are more profitable than general shuffles. */
return false;
}
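/* Sketch of the new V4HImode even/odd extraction above, mirroring the
   existing V8HImode path: with each operand viewed as V2SImode, an AND
   with 0xffff keeps the even halfword of every dword and a logical
   shift right by 16 keeps the odd one; mmx_packusdw then narrows the
   two masked/shifted operands back to V4HImode, e.g. even extraction
   of {a0,a1,a2,a3} and {b0,b1,b2,b3} yields {a0,a2,b0,b2}.  */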
@@ -19174,6 +20239,7 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
case E_V2DImode:
case E_V2SImode:
case E_V4SImode:
+ case E_V2HImode:
/* These are always directly implementable by expand_vec_perm_1. */
gcc_unreachable ();
@@ -19184,19 +20250,46 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
return false;
break;
+ case E_V4QImode:
+ if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
+ return expand_vec_perm_pshufb2 (d);
+ else
+ {
+ if (d->testing_p)
+ break;
+ /* We need 2*log2(N)-1 operations to achieve odd/even
+ with interleave. */
+ t1 = gen_reg_rtx (V4QImode);
+ emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1));
+ emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1));
+ if (odd)
+ t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1);
+ else
+ t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1);
+ emit_insn (t2);
+ }
+ break;
+
case E_V4HImode:
- if (d->testing_p)
- break;
- /* We need 2*log2(N)-1 operations to achieve odd/even
- with interleave. */
- t1 = gen_reg_rtx (V4HImode);
- emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
- emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
- if (odd)
- t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
+ if (TARGET_SSE4_1)
+ return expand_vec_perm_even_odd_pack (d);
+ else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB)
+ return expand_vec_perm_pshufb2 (d);
else
- t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
- emit_insn (t2);
+ {
+ if (d->testing_p)
+ break;
+ /* We need 2*log2(N)-1 operations to achieve odd/even
+ with interleave. */
+ t1 = gen_reg_rtx (V4HImode);
+ emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1));
+ emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1));
+ if (odd)
+ t2 = gen_mmx_punpckhwd (d->target, d->target, t1);
+ else
+ t2 = gen_mmx_punpcklwd (d->target, d->target, t1);
+ emit_insn (t2);
+ }
break;
case E_V8HImode:
@@ -19224,6 +20317,7 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd)
}
break;
+ case E_V8QImode:
case E_V16QImode:
return expand_vec_perm_even_odd_pack (d);
@@ -19354,6 +20448,11 @@ expand_vec_perm_even_odd (struct expand_vec_perm_d *d)
if (d->perm[i] != 2 * i + odd)
return false;
+ if (d->vmode == E_V32HImode
+ && d->testing_p
+ && !TARGET_AVX512BW)
+ return false;
+
return expand_vec_perm_even_odd_1 (d, odd);
}
@@ -19365,6 +20464,7 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
{
unsigned elt = d->perm[0], nelt2 = d->nelt / 2;
machine_mode vmode = d->vmode;
+ rtx (*gen) (rtx, rtx, rtx);
unsigned char perm2[4];
rtx op0 = d->op0, dest;
bool ok;
@@ -19384,9 +20484,70 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
case E_V2DImode:
case E_V2SImode:
case E_V4SImode:
+ case E_V2HImode:
+ case E_V4HImode:
/* These are always implementable using standard shuffle patterns. */
gcc_unreachable ();
+ case E_V4QImode:
+ /* This can be implemented via interleave and pshuflw. */
+ if (d->testing_p)
+ return true;
+
+ if (elt >= nelt2)
+ {
+ gen = gen_mmx_punpckhbw_low;
+ elt -= nelt2;
+ }
+ else
+ gen = gen_mmx_punpcklbw_low;
+
+ dest = gen_reg_rtx (vmode);
+ emit_insn (gen (dest, op0, op0));
+ vmode = get_mode_wider_vector (vmode);
+ op0 = gen_lowpart (vmode, dest);
+
+ memset (perm2, elt, 2);
+ dest = gen_reg_rtx (vmode);
+ ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
+ gcc_assert (ok);
+
+ emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
+ return true;
+
+ case E_V8QImode:
+ /* This can be implemented via interleave. We save one insn by
+ stopping once we have promoted to V2SImode and then using pshufd. */
+ if (d->testing_p)
+ return true;
+ do
+ {
+ if (elt >= nelt2)
+ {
+ gen = vmode == V8QImode ? gen_mmx_punpckhbw
+ : gen_mmx_punpckhwd;
+ elt -= nelt2;
+ }
+ else
+ gen = vmode == V8QImode ? gen_mmx_punpcklbw
+ : gen_mmx_punpcklwd;
+ nelt2 /= 2;
+
+ dest = gen_reg_rtx (vmode);
+ emit_insn (gen (dest, op0, op0));
+ vmode = get_mode_wider_vector (vmode);
+ op0 = gen_lowpart (vmode, dest);
+ }
+ while (vmode != V2SImode);
+
+ memset (perm2, elt, 2);
+ dest = gen_reg_rtx (vmode);
+ ok = expand_vselect (dest, op0, perm2, 2, d->testing_p);
+ gcc_assert (ok);
+
+ emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
+ return true;
+
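/* Illustrative trace of the V8QImode broadcast above for elt == 5:
   punpckhbw turns {b0..b7} into {b4,b4,b5,b5,b6,b6,b7,b7} and elt
   becomes 1; punpcklwd on the V4HImode view then gives
   {b4,b4,b4,b4,b5,b5,b5,b5}; the final two-element V2SImode vselect
   with perm {1,1} replicates the dword holding b5 across the vector.  */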
case E_V8HImode:
case E_V16QImode:
/* These can be implemented via interleave. We save one insn by
@@ -19395,17 +20556,15 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
return true;
do
{
- rtx dest;
- rtx (*gen) (rtx, rtx, rtx)
- = vmode == V16QImode ? gen_vec_interleave_lowv16qi
- : gen_vec_interleave_lowv8hi;
-
if (elt >= nelt2)
{
gen = vmode == V16QImode ? gen_vec_interleave_highv16qi
: gen_vec_interleave_highv8hi;
elt -= nelt2;
}
+ else
+ gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi
+ : gen_vec_interleave_lowv8hi;
nelt2 /= 2;
dest = gen_reg_rtx (vmode);
@@ -19416,14 +20575,13 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
while (vmode != V4SImode);
memset (perm2, elt, 4);
- dest = gen_reg_rtx (V4SImode);
+ dest = gen_reg_rtx (vmode);
ok = expand_vselect (dest, op0, perm2, 4, d->testing_p);
gcc_assert (ok);
- if (!d->testing_p)
- emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
+
+ emit_move_insn (d->target, gen_lowpart (d->vmode, dest));
return true;
- case E_V64QImode:
case E_V32QImode:
case E_V16HImode:
case E_V8SImode:
@@ -19433,6 +20591,14 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d)
gcc_assert (!TARGET_AVX2 || d->perm[0]);
return false;
+ case E_V64QImode:
+ gcc_assert (!TARGET_AVX512BW || d->perm[0]);
+ return false;
+
+ case E_V32HImode:
+ gcc_assert (!TARGET_AVX512BW);
+ return false;
+
default:
gcc_unreachable ();
}
@@ -19677,6 +20843,12 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_pblendv (d))
return true;
+ if (expand_vec_perm_2perm_interleave (d, true))
+ return true;
+
+ if (expand_vec_perm_2perm_pblendv (d, true))
+ return true;
+
/* Try sequences of three instructions. */
if (expand_vec_perm_even_odd_pack (d))
@@ -19694,6 +20866,12 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d)
if (expand_vec_perm_vperm2f128_vblend (d))
return true;
+ if (expand_vec_perm_2perm_interleave (d, false))
+ return true;
+
+ if (expand_vec_perm_2perm_pblendv (d, false))
+ return true;
+
/* Try sequences of four instructions. */
if (expand_vec_perm_even_odd_trunc (d))
@@ -19822,16 +21000,16 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
return true;
break;
case E_V32HImode:
- if (!TARGET_AVX512BW)
+ if (!TARGET_AVX512F)
return false;
- if (d.testing_p)
+ if (d.testing_p && TARGET_AVX512BW)
/* All implementable with a single vperm[it]2 insn. */
return true;
break;
case E_V64QImode:
- if (!TARGET_AVX512BW)
+ if (!TARGET_AVX512F)
return false;
- if (d.testing_p)
+ if (d.testing_p && TARGET_AVX512BW)
/* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */
return true;
break;
@@ -19878,9 +21056,21 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
case E_V2SFmode:
case E_V2SImode:
case E_V4HImode:
+ case E_V8QImode:
if (!TARGET_MMX_WITH_SSE)
return false;
break;
+ case E_V2HImode:
+ if (!TARGET_SSE2)
+ return false;
+ /* All implementable with *punpckwd. */
+ if (d.testing_p)
+ return true;
+ break;
+ case E_V4QImode:
+ if (!TARGET_SSE2)
+ return false;
+ break;
case E_V2DImode:
case E_V2DFmode:
if (!TARGET_SSE)
@@ -19912,10 +21102,11 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
/* Check whether the mask can be applied to the vector type. */
d.one_operand_p = (which != 3);
- /* Implementable with shufps or pshufd. */
+ /* Implementable with shufps, pshufd or pshuflw. */
if (d.one_operand_p
&& (d.vmode == V4SFmode || d.vmode == V2SFmode
- || d.vmode == V4SImode || d.vmode == V2SImode))
+ || d.vmode == V4SImode || d.vmode == V2SImode
+ || d.vmode == V4HImode || d.vmode == V2HImode))
return true;
/* Otherwise we have to go through the motions and see if we can
@@ -19934,6 +21125,32 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0,
two_args = canonicalize_perm (&d);
+ /* If one of the operands is a zero vector, try to match pmovzx. */
+ if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode)))
+ {
+ struct expand_vec_perm_d dzero = d;
+ if (d.op0 == CONST0_RTX (vmode))
+ {
+ d.op1 = dzero.op1 = force_reg (vmode, d.op1);
+ std::swap (dzero.op0, dzero.op1);
+ for (i = 0; i < nelt; ++i)
+ dzero.perm[i] ^= nelt;
+ }
+ else
+ d.op0 = dzero.op0 = force_reg (vmode, d.op0);
+
+ if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1,
+ dzero.perm, nelt, dzero.testing_p))
+ return true;
+ }
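/* Illustrative example of the pmovzx matching above: for V16QImode
   with op1 == const0 and perm {0,16,1,17,...,7,23}, the
   expand_vselect_vconcat call sees an interleave low with zero, i.e. a
   zero extension of the low eight bytes (pmovzxbw); if the zero vector
   is op0 instead, the operands are swapped and each index is XORed
   with nelt so the same form is reached.  */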
+
+ /* Force operands into registers. */
+ rtx nop0 = force_reg (vmode, d.op0);
+ if (d.op0 == d.op1)
+ d.op1 = nop0;
+ d.op0 = nop0;
+ d.op1 = force_reg (vmode, d.op1);
+
if (ix86_expand_vec_perm_const_1 (&d))
return true;
@@ -20003,8 +21220,9 @@ ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
gcc_assert (ok);
}
-/* Optimize vector MUL generation for V8QI, V16QI and V32QI
- under TARGET_AVX512BW. i.e. for v16qi a * b, it has
+/* This function is similar to ix86_expand_vecop_qihi,
+ but optimized for AVX512BW by using vpmovwb.
+ For example, it optimizes vector MUL generation like
vpmovzxbw ymm2, xmm0
vpmovzxbw ymm3, xmm1
@@ -20014,13 +21232,14 @@ ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p)
 it would take fewer instructions than ix86_expand_vecop_qihi.
Return true if success. */
-bool
-ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
+static bool
+ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2)
{
machine_mode himode, qimode = GET_MODE (dest);
rtx hop1, hop2, hdest;
rtx (*gen_extend)(rtx, rtx);
rtx (*gen_truncate)(rtx, rtx);
+ bool uns_p = (code != ASHIFTRT);
/* There's no V64HImode multiplication instruction. */
if (qimode == E_V64QImode)
@@ -20041,17 +21260,17 @@ ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
{
case E_V8QImode:
himode = V8HImode;
- gen_extend = gen_zero_extendv8qiv8hi2;
+ gen_extend = uns_p ? gen_zero_extendv8qiv8hi2 : gen_extendv8qiv8hi2;
gen_truncate = gen_truncv8hiv8qi2;
break;
case E_V16QImode:
himode = V16HImode;
- gen_extend = gen_zero_extendv16qiv16hi2;
+ gen_extend = uns_p ? gen_zero_extendv16qiv16hi2 : gen_extendv16qiv16hi2;
gen_truncate = gen_truncv16hiv16qi2;
break;
case E_V32QImode:
himode = V32HImode;
- gen_extend = gen_zero_extendv32qiv32hi2;
+ gen_extend = uns_p ? gen_zero_extendv32qiv32hi2 : gen_extendv32qiv32hi2;
gen_truncate = gen_truncv32hiv32qi2;
break;
default:
@@ -20063,7 +21282,7 @@ ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
hdest = gen_reg_rtx (himode);
emit_insn (gen_extend (hop1, op1));
emit_insn (gen_extend (hop2, op2));
- emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (MULT, himode,
+ emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode,
hop1, hop2)));
emit_insn (gen_truncate (dest, hdest));
return true;
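/* Rough shape of what ix86_expand_vecop_qihi2 now emits for a variable
   arithmetic shift (code == ASHIFTRT) on v16qi, analogous to the MUL
   sequence in the comment above but with sign extension:
     vpmovsxbw  ymm2, xmm0
     vpmovsxbw  ymm3, xmm1
     vpsravw    ymm4, ymm2, ymm3
     vpmovwb    xmm0, ymm4
   (requires AVX512BW/AVX512VL; register allocation is illustrative).  */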
@@ -20071,8 +21290,9 @@ ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2)
/* Expand a vector shift by a constant amount for a V*QImode in terms of
 the same operation on V*HImode. Return true on success. */
-bool
-ix86_expand_vec_shift_qihi_constant (enum rtx_code code, rtx dest, rtx op1, rtx op2)
+static bool
+ix86_expand_vec_shift_qihi_constant (enum rtx_code code,
+ rtx dest, rtx op1, rtx op2)
{
machine_mode qimode, himode;
HOST_WIDE_INT and_constant, xor_constant;
@@ -20184,6 +21404,16 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
bool uns_p = false;
int i;
+ if (CONST_INT_P (op2)
+ && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT)
+ && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2))
+ return;
+
+ if (TARGET_AVX512BW
+ && VECTOR_MODE_P (GET_MODE (op2))
+ && ix86_expand_vecop_qihi2 (code, dest, op1, op2))
+ return;
+
switch (qimode)
{
case E_V16QImode:
@@ -20205,7 +21435,6 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
gcc_unreachable ();
}
- op2_l = op2_h = op2;
switch (code)
{
case MULT:
@@ -20234,17 +21463,46 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2)
op1_h = gen_reg_rtx (himode);
ix86_expand_sse_unpack (op1_l, op1, uns_p, false);
ix86_expand_sse_unpack (op1_h, op1, uns_p, true);
+ /* vashr/vlshr/vashl. */
+ if (GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
+ {
+ rtx tmp = force_reg (qimode, op2);
+ op2_l = gen_reg_rtx (himode);
+ op2_h = gen_reg_rtx (himode);
+ ix86_expand_sse_unpack (op2_l, tmp, uns_p, false);
+ ix86_expand_sse_unpack (op2_h, tmp, uns_p, true);
+ }
+ else
+ op2_l = op2_h = op2;
+
full_interleave = true;
break;
default:
gcc_unreachable ();
}
- /* Perform the operation. */
- res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
- 1, OPTAB_DIRECT);
- res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
- 1, OPTAB_DIRECT);
+ /* Perform vashr/vlshr/vashl. */
+ if (code != MULT
+ && GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT)
+ {
+ res_l = gen_reg_rtx (himode);
+ res_h = gen_reg_rtx (himode);
+ emit_insn (gen_rtx_SET (res_l,
+ simplify_gen_binary (code, himode,
+ op1_l, op2_l)));
+ emit_insn (gen_rtx_SET (res_h,
+ simplify_gen_binary (code, himode,
+ op1_h, op2_h)));
+ }
+ /* Perform mult/ashr/lshr/ashl. */
+ else
+ {
+ res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX,
+ 1, OPTAB_DIRECT);
+ res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX,
+ 1, OPTAB_DIRECT);
+ }
+
gcc_assert (res_l && res_h);
/* Merge the data back into the right place. */