From b18a97e5dd0935e1c4a626c230f21457d0aad3d5 Mon Sep 17 00:00:00 2001 From: Thomas Koenig Date: Mon, 13 Sep 2021 19:49:49 +0200 Subject: Merged current trunk to branch. --- gcc/config/i386/i386-expand.c | 2292 +++++++++++++++++++++++++++++++---------- 1 file changed, 1775 insertions(+), 517 deletions(-) (limited to 'gcc/config/i386/i386-expand.c') diff --git a/gcc/config/i386/i386-expand.c b/gcc/config/i386/i386-expand.c index 6e08fd32726..e117afb16b8 100644 --- a/gcc/config/i386/i386-expand.c +++ b/gcc/config/i386/i386-expand.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1988-2020 Free Software Foundation, Inc. +/* Copyright (C) 1988-2021 Free Software Foundation, Inc. This file is part of GCC. @@ -154,9 +154,13 @@ split_double_mode (machine_mode mode, rtx operands[], lo_half[num] = simplify_gen_subreg (half_mode, op, GET_MODE (op) == VOIDmode ? mode : GET_MODE (op), 0); - hi_half[num] = simplify_gen_subreg (half_mode, op, - GET_MODE (op) == VOIDmode - ? mode : GET_MODE (op), byte); + + rtx tmp = simplify_gen_subreg (half_mode, op, + GET_MODE (op) == VOIDmode + ? mode : GET_MODE (op), byte); + /* simplify_gen_subreg will return NULL RTX for the + high half of the paradoxical subreg. */ + hi_half[num] = tmp ? tmp : gen_reg_rtx (half_mode); } } } @@ -186,6 +190,82 @@ ix86_expand_clear (rtx dest) emit_insn (tmp); } +/* Return true if V can be broadcasted from an integer of WIDTH bits + which is returned in VAL_BROADCAST. Otherwise, return false. */ + +static bool +ix86_broadcast (HOST_WIDE_INT v, unsigned int width, + HOST_WIDE_INT &val_broadcast) +{ + wide_int val = wi::uhwi (v, HOST_BITS_PER_WIDE_INT); + val_broadcast = wi::extract_uhwi (val, 0, width); + for (unsigned int i = width; i < HOST_BITS_PER_WIDE_INT; i += width) + { + HOST_WIDE_INT each = wi::extract_uhwi (val, i, width); + if (val_broadcast != each) + return false; + } + val_broadcast = sext_hwi (val_broadcast, width); + return true; +} + +/* Convert the CONST_WIDE_INT operand OP to broadcast in MODE. */ + +static rtx +ix86_convert_const_wide_int_to_broadcast (machine_mode mode, rtx op) +{ + /* Don't use integer vector broadcast if we can't move from GPR to SSE + register directly. */ + if (!TARGET_INTER_UNIT_MOVES_TO_VEC) + return nullptr; + + /* Convert CONST_WIDE_INT to a non-standard SSE constant integer + broadcast only if vector broadcast is available. */ + if (!TARGET_AVX + || !CONST_WIDE_INT_P (op) + || standard_sse_constant_p (op, mode)) + return nullptr; + + HOST_WIDE_INT val = CONST_WIDE_INT_ELT (op, 0); + HOST_WIDE_INT val_broadcast; + scalar_int_mode broadcast_mode; + if (TARGET_AVX2 + && ix86_broadcast (val, GET_MODE_BITSIZE (QImode), + val_broadcast)) + broadcast_mode = QImode; + else if (TARGET_AVX2 + && ix86_broadcast (val, GET_MODE_BITSIZE (HImode), + val_broadcast)) + broadcast_mode = HImode; + else if (ix86_broadcast (val, GET_MODE_BITSIZE (SImode), + val_broadcast)) + broadcast_mode = SImode; + else if (TARGET_64BIT + && ix86_broadcast (val, GET_MODE_BITSIZE (DImode), + val_broadcast)) + broadcast_mode = DImode; + else + return nullptr; + + /* Check if OP can be broadcasted from VAL. 
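+     That is, every HOST_WIDE_INT element of OP must repeat the first
+     element VAL; e.g. a 128-bit constant whose two 64-bit halves
+     differ cannot be broadcast and we return nullptr below.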
*/ + for (int i = 1; i < CONST_WIDE_INT_NUNITS (op); i++) + if (val != CONST_WIDE_INT_ELT (op, i)) + return nullptr; + + unsigned int nunits = (GET_MODE_SIZE (mode) + / GET_MODE_SIZE (broadcast_mode)); + machine_mode vector_mode; + if (!mode_for_vector (broadcast_mode, nunits).exists (&vector_mode)) + gcc_unreachable (); + rtx target = ix86_gen_scratch_sse_rtx (vector_mode); + bool ok = ix86_expand_vector_init_duplicate (false, vector_mode, + target, + GEN_INT (val_broadcast)); + gcc_assert (ok); + target = lowpart_subreg (mode, target, vector_mode); + return target; +} + void ix86_expand_move (machine_mode mode, rtx operands[]) { @@ -343,20 +423,29 @@ ix86_expand_move (machine_mode mode, rtx operands[]) && optimize) op1 = copy_to_mode_reg (mode, op1); - if (can_create_pseudo_p () - && CONST_DOUBLE_P (op1)) + if (can_create_pseudo_p ()) { - /* If we are loading a floating point constant to a register, - force the value to memory now, since we'll get better code - out the back end. */ + if (CONST_DOUBLE_P (op1)) + { + /* If we are loading a floating point constant to a + register, force the value to memory now, since we'll + get better code out the back end. */ - op1 = validize_mem (force_const_mem (mode, op1)); - if (!register_operand (op0, mode)) + op1 = validize_mem (force_const_mem (mode, op1)); + if (!register_operand (op0, mode)) + { + rtx temp = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (temp, op1)); + emit_move_insn (op0, temp); + return; + } + } + else if (GET_MODE_SIZE (mode) >= 16) { - rtx temp = gen_reg_rtx (mode); - emit_insn (gen_rtx_SET (temp, op1)); - emit_move_insn (op0, temp); - return; + rtx tmp = ix86_convert_const_wide_int_to_broadcast + (GET_MODE (op0), op1); + if (tmp != nullptr) + op1 = tmp; } } } @@ -364,6 +453,70 @@ ix86_expand_move (machine_mode mode, rtx operands[]) emit_insn (gen_rtx_SET (op0, op1)); } +/* OP is a memref of CONST_VECTOR, return scalar constant mem + if CONST_VECTOR is a vec_duplicate, else return NULL. */ +static rtx +ix86_broadcast_from_constant (machine_mode mode, rtx op) +{ + int nunits = GET_MODE_NUNITS (mode); + if (nunits < 2) + return nullptr; + + /* Don't use integer vector broadcast if we can't move from GPR to SSE + register directly. */ + if (!TARGET_INTER_UNIT_MOVES_TO_VEC + && INTEGRAL_MODE_P (mode)) + return nullptr; + + /* Convert CONST_VECTOR to a non-standard SSE constant integer + broadcast only if vector broadcast is available. */ + if (!(TARGET_AVX2 + || (TARGET_AVX + && (GET_MODE_INNER (mode) == SImode + || GET_MODE_INNER (mode) == DImode)) + || FLOAT_MODE_P (mode)) + || standard_sse_constant_p (op, mode)) + return nullptr; + + /* Don't broadcast from a 64-bit integer constant in 32-bit mode. + We can still put 64-bit integer constant in memory when + avx512 embed broadcast is available. */ + if (GET_MODE_INNER (mode) == DImode && !TARGET_64BIT + && (!TARGET_AVX512F + || (GET_MODE_SIZE (mode) < 64 && !TARGET_AVX512VL))) + return nullptr; + + if (GET_MODE_INNER (mode) == TImode) + return nullptr; + + rtx constant = get_pool_constant (XEXP (op, 0)); + if (GET_CODE (constant) != CONST_VECTOR) + return nullptr; + + /* There could be some rtx like + (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1"))) + but with "*.LC1" refer to V2DI constant vector. 
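+     Convert the constant to MODE first so that the element-wise
+     comparison below sees the vector in its intended layout.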
*/ + if (GET_MODE (constant) != mode) + { + constant = simplify_subreg (mode, constant, GET_MODE (constant), + 0); + if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR) + return nullptr; + } + + rtx first = XVECEXP (constant, 0, 0); + + for (int i = 1; i < nunits; ++i) + { + rtx tmp = XVECEXP (constant, 0, i); + /* Vector duplicate value. */ + if (!rtx_equal_p (tmp, first)) + return nullptr; + } + + return first; +} + void ix86_expand_vector_move (machine_mode mode, rtx operands[]) { @@ -403,7 +556,39 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[]) op1 = simplify_gen_subreg (mode, r, imode, SUBREG_BYTE (op1)); } else - op1 = validize_mem (force_const_mem (mode, op1)); + { + machine_mode mode = GET_MODE (op0); + rtx tmp = ix86_convert_const_wide_int_to_broadcast + (mode, op1); + if (tmp == nullptr) + op1 = validize_mem (force_const_mem (mode, op1)); + else + op1 = tmp; + } + } + + if (can_create_pseudo_p () + && GET_MODE_SIZE (mode) >= 16 + && VECTOR_MODE_P (mode) + && (MEM_P (op1) + && SYMBOL_REF_P (XEXP (op1, 0)) + && CONSTANT_POOL_ADDRESS_P (XEXP (op1, 0)))) + { + rtx first = ix86_broadcast_from_constant (mode, op1); + if (first != nullptr) + { + /* Broadcast to XMM/YMM/ZMM register from an integer + constant or scalar mem. */ + op1 = gen_reg_rtx (mode); + if (FLOAT_MODE_P (mode) + || (!TARGET_64BIT && GET_MODE_INNER (mode) == DImode)) + first = force_const_mem (GET_MODE_INNER (mode), first); + bool ok = ix86_expand_vector_init_duplicate (false, mode, + op1, first); + gcc_assert (ok); + emit_move_insn (op0, op1); + return; + } } /* We need to check memory alignment for SSE mode since attribute @@ -419,7 +604,11 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[]) arguments in memory. */ if (!register_operand (op0, mode) && !register_operand (op1, mode)) - op1 = force_reg (mode, op1); + { + rtx scratch = ix86_gen_scratch_sse_rtx (mode); + emit_move_insn (scratch, op1); + op1 = scratch; + } tmp[0] = op0; tmp[1] = op1; ix86_expand_vector_move_misalign (mode, tmp); @@ -431,7 +620,9 @@ ix86_expand_vector_move (machine_mode mode, rtx operands[]) && !register_operand (op0, mode) && !register_operand (op1, mode)) { - emit_move_insn (op0, force_reg (GET_MODE (op0), op1)); + rtx tmp = ix86_gen_scratch_sse_rtx (GET_MODE (op0)); + emit_move_insn (tmp, op1); + emit_move_insn (op0, tmp); return; } @@ -487,6 +678,10 @@ ix86_avx256_split_vector_move_misalign (rtx op0, rtx op1) extract = gen_avx_vextractf128v32qi; mode = V16QImode; break; + case E_V16HFmode: + extract = gen_avx_vextractf128v16hf; + mode = V8HFmode; + break; case E_V8SFmode: extract = gen_avx_vextractf128v8sf; mode = V4SFmode; @@ -759,6 +954,7 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p) switch (mode) { + case E_V4QImode: case E_V8QImode: sse_mode = V16QImode; double_sse_mode = V32QImode; @@ -775,6 +971,7 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p) break; case E_V4HImode: + case E_V2HImode: sse_mode = V8HImode; double_sse_mode = V16HImode; mask = gen_rtx_PARALLEL (VOIDmode, @@ -794,6 +991,15 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p) GEN_INT (1), GEN_INT (5))); break; + case E_V2SFmode: + sse_mode = V4SFmode; + double_sse_mode = V8SFmode; + mask = gen_rtx_PARALLEL (VOIDmode, + gen_rtvec (4, + GEN_INT (0), GEN_INT (4), + GEN_INT (1), GEN_INT (5))); + break; + default: gcc_unreachable (); } @@ -808,14 +1014,36 @@ ix86_split_mmx_punpck (rtx operands[], bool high_p) rtx insn = gen_rtx_SET (dest, op2); emit_insn (insn); + /* Move high bits to low bits. 
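+     (bits 64:127 to bits 0:63 for 8-byte modes, bits 32:63 to
+     bits 0:31 for 4-byte modes).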
*/ if (high_p) { - /* Move bits 64:127 to bits 0:63. */ - mask = gen_rtx_PARALLEL (VOIDmode, - gen_rtvec (4, GEN_INT (2), GEN_INT (3), - GEN_INT (0), GEN_INT (0))); - dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest)); - op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask); + if (sse_mode == V4SFmode) + { + mask = gen_rtx_PARALLEL (VOIDmode, + gen_rtvec (4, GEN_INT (2), GEN_INT (3), + GEN_INT (4), GEN_INT (5))); + op2 = gen_rtx_VEC_CONCAT (V8SFmode, dest, dest); + op1 = gen_rtx_VEC_SELECT (V4SFmode, op2, mask); + } + else + { + int sz = GET_MODE_SIZE (mode); + + if (sz == 4) + mask = gen_rtx_PARALLEL (VOIDmode, + gen_rtvec (4, GEN_INT (1), GEN_INT (0), + GEN_INT (0), GEN_INT (1))); + else if (sz == 8) + mask = gen_rtx_PARALLEL (VOIDmode, + gen_rtvec (4, GEN_INT (2), GEN_INT (3), + GEN_INT (0), GEN_INT (1))); + else + gcc_unreachable (); + + dest = lowpart_subreg (V4SImode, dest, GET_MODE (dest)); + op1 = gen_rtx_VEC_SELECT (V4SImode, dest, mask); + } + insn = gen_rtx_SET (dest, op1); emit_insn (insn); } @@ -1291,6 +1519,9 @@ find_nearest_reg_def (rtx_insn *insn, int regno1, int regno2) return false; } +/* INSN_UID of the last insn emitted by zero store peephole2s. */ +int ix86_last_zero_store_uid; + /* Split lea instructions into a sequence of instructions which are executed on ALU to avoid AGU stalls. It is assumed that it is allowed to clobber flags register @@ -1348,9 +1579,10 @@ ix86_split_lea_for_addr (rtx_insn *insn, rtx operands[], machine_mode mode) if (regno0 != regno2) emit_insn (gen_rtx_SET (target, parts.index)); - /* Use shift for scaling. */ - ix86_emit_binop (ASHIFT, mode, target, - GEN_INT (exact_log2 (parts.scale))); + /* Use shift for scaling, but emit it as MULT instead + to avoid it being immediately peephole2 optimized back + into lea. */ + ix86_emit_binop (MULT, mode, target, GEN_INT (parts.scale)); if (parts.base) ix86_emit_binop (PLUS, mode, target, parts.base); @@ -1549,6 +1781,8 @@ ix86_expand_convert_uns_sixf_sse (rtx, rtx) gcc_unreachable (); } +static rtx ix86_expand_sse_fabs (rtx op0, rtx *smask); + /* Convert an unsigned SImode value into a DFmode. Only currently used for SSE, but applicable anywhere. */ @@ -1568,6 +1802,11 @@ ix86_expand_convert_uns_sidf_sse (rtx target, rtx input) x = const_double_from_real_value (TWO31r, DFmode); x = expand_simple_binop (DFmode, PLUS, fp, x, target, 0, OPTAB_DIRECT); + + /* Remove the sign with FE_DOWNWARD, where x - x = -0.0. */ + if (HONOR_SIGNED_ZEROS (DFmode) && flag_rounding_math) + x = ix86_expand_sse_fabs (x, NULL); + if (x != target) emit_move_insn (target, x); } @@ -1616,12 +1855,21 @@ ix86_expand_convert_uns_sisf_sse (rtx target, rtx input) fp_lo = gen_reg_rtx (SFmode); emit_insn (gen_floatsisf2 (fp_hi, int_hi)); emit_insn (gen_floatsisf2 (fp_lo, int_lo)); - fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi, - 0, OPTAB_DIRECT); - fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target, - 0, OPTAB_DIRECT); - if (!rtx_equal_p (target, fp_hi)) - emit_move_insn (target, fp_hi); + if (TARGET_FMA) + { + x = validize_mem (force_const_mem (SFmode, x)); + fp_hi = gen_rtx_FMA (SFmode, fp_hi, x, fp_lo); + emit_move_insn (target, fp_hi); + } + else + { + fp_hi = expand_simple_binop (SFmode, MULT, fp_hi, x, fp_hi, + 0, OPTAB_DIRECT); + fp_hi = expand_simple_binop (SFmode, PLUS, fp_hi, fp_lo, target, + 0, OPTAB_DIRECT); + if (!rtx_equal_p (target, fp_hi)) + emit_move_insn (target, fp_hi); + } } /* floatunsv{4,8}siv{4,8}sf2 expander. 
Expand code to convert @@ -1653,12 +1901,20 @@ ix86_expand_vector_convert_uns_vsivsf (rtx target, rtx val) real_ldexp (&TWO16r, &dconst1, 16); tmp[5] = const_double_from_real_value (TWO16r, SFmode); tmp[5] = force_reg (fltmode, ix86_build_const_vector (fltmode, 1, tmp[5])); - tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], NULL_RTX, 1, - OPTAB_DIRECT); - tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], target, 1, - OPTAB_DIRECT); - if (tmp[7] != target) - emit_move_insn (target, tmp[7]); + if (TARGET_FMA) + { + tmp[6] = gen_rtx_FMA (fltmode, tmp[4], tmp[5], tmp[3]); + emit_move_insn (target, tmp[6]); + } + else + { + tmp[6] = expand_simple_binop (fltmode, MULT, tmp[4], tmp[5], + NULL_RTX, 1, OPTAB_DIRECT); + tmp[7] = expand_simple_binop (fltmode, PLUS, tmp[3], tmp[6], + target, 1, OPTAB_DIRECT); + if (tmp[7] != target) + emit_move_insn (target, tmp[7]); + } } /* Adjust a V*SFmode/V*DFmode value VAL so that *sfix_trunc* resp. fix_trunc* @@ -1863,13 +2119,9 @@ void ix86_expand_copysign (rtx operands[]) { machine_mode mode, vmode; - rtx dest, op0, op1, mask; - - dest = operands[0]; - op0 = operands[1]; - op1 = operands[2]; + rtx dest, op0, op1, mask, op2, op3; - mode = GET_MODE (dest); + mode = GET_MODE (operands[0]); if (mode == SFmode) vmode = V4SFmode; @@ -1880,136 +2132,40 @@ ix86_expand_copysign (rtx operands[]) else gcc_unreachable (); - mask = ix86_build_signbit_mask (vmode, 0, 0); - - if (CONST_DOUBLE_P (op0)) - { - if (real_isneg (CONST_DOUBLE_REAL_VALUE (op0))) - op0 = simplify_unary_operation (ABS, mode, op0, mode); - - if (mode == SFmode || mode == DFmode) - { - if (op0 == CONST0_RTX (mode)) - op0 = CONST0_RTX (vmode); - else - { - rtx v = ix86_build_const_vector (vmode, false, op0); - - op0 = force_reg (vmode, v); - } - } - else if (op0 != CONST0_RTX (mode)) - op0 = force_reg (mode, op0); - - emit_insn (gen_copysign3_const (mode, dest, op0, op1, mask)); - } - else - { - rtx nmask = ix86_build_signbit_mask (vmode, 0, 1); - - emit_insn (gen_copysign3_var - (mode, dest, NULL_RTX, op0, op1, nmask, mask)); - } -} - -/* Deconstruct a copysign operation into bit masks. Operand 0 is known to - be a constant, and so has already been expanded into a vector constant. */ - -void -ix86_split_copysign_const (rtx operands[]) -{ - machine_mode mode, vmode; - rtx dest, op0, mask, x; - - dest = operands[0]; - op0 = operands[1]; - mask = operands[3]; - - mode = GET_MODE (dest); - vmode = GET_MODE (mask); - - dest = lowpart_subreg (vmode, dest, mode); - x = gen_rtx_AND (vmode, dest, mask); - emit_insn (gen_rtx_SET (dest, x)); - - if (op0 != CONST0_RTX (vmode)) - { - x = gen_rtx_IOR (vmode, dest, op0); - emit_insn (gen_rtx_SET (dest, x)); - } -} - -/* Deconstruct a copysign operation into bit masks. Operand 0 is variable, - so we have to do two masks. */ - -void -ix86_split_copysign_var (rtx operands[]) -{ - machine_mode mode, vmode; - rtx dest, scratch, op0, op1, mask, nmask, x; - - dest = operands[0]; - scratch = operands[1]; - op0 = operands[2]; - op1 = operands[3]; - nmask = operands[4]; - mask = operands[5]; - - mode = GET_MODE (dest); - vmode = GET_MODE (mask); - - if (rtx_equal_p (op0, op1)) + if (rtx_equal_p (operands[1], operands[2])) { - /* Shouldn't happen often (it's useless, obviously), but when it does - we'd generate incorrect code if we continue below. 
*/ - emit_move_insn (dest, op0); + emit_move_insn (operands[0], operands[1]); return; } - if (REG_P (mask) && REGNO (dest) == REGNO (mask)) /* alternative 0 */ - { - gcc_assert (REGNO (op1) == REGNO (scratch)); - - x = gen_rtx_AND (vmode, scratch, mask); - emit_insn (gen_rtx_SET (scratch, x)); + dest = lowpart_subreg (vmode, operands[0], mode); + op1 = lowpart_subreg (vmode, operands[2], mode); + mask = ix86_build_signbit_mask (vmode, 0, 0); - dest = mask; - op0 = lowpart_subreg (vmode, op0, mode); - x = gen_rtx_NOT (vmode, dest); - x = gen_rtx_AND (vmode, x, op0); - emit_insn (gen_rtx_SET (dest, x)); - } - else + if (CONST_DOUBLE_P (operands[1])) { - if (REGNO (op1) == REGNO (scratch)) /* alternative 1,3 */ + op0 = simplify_unary_operation (ABS, mode, operands[1], mode); + /* Optimize for 0, simplify b = copy_signf (0.0f, a) to b = mask & a. */ + if (op0 == CONST0_RTX (mode)) { - x = gen_rtx_AND (vmode, scratch, mask); - } - else /* alternative 2,4 */ - { - gcc_assert (REGNO (mask) == REGNO (scratch)); - op1 = lowpart_subreg (vmode, op1, mode); - x = gen_rtx_AND (vmode, scratch, op1); + emit_move_insn (dest, gen_rtx_AND (vmode, mask, op1)); + return; } - emit_insn (gen_rtx_SET (scratch, x)); - if (REGNO (op0) == REGNO (dest)) /* alternative 1,2 */ - { - dest = lowpart_subreg (vmode, op0, mode); - x = gen_rtx_AND (vmode, dest, nmask); - } - else /* alternative 3,4 */ - { - gcc_assert (REGNO (nmask) == REGNO (dest)); - dest = nmask; - op0 = lowpart_subreg (vmode, op0, mode); - x = gen_rtx_AND (vmode, dest, op0); - } - emit_insn (gen_rtx_SET (dest, x)); + if (GET_MODE_SIZE (mode) < 16) + op0 = ix86_build_const_vector (vmode, false, op0); + op0 = force_reg (vmode, op0); } - - x = gen_rtx_IOR (vmode, dest, scratch); - emit_insn (gen_rtx_SET (dest, x)); + else + op0 = lowpart_subreg (vmode, operands[1], mode); + + op2 = gen_reg_rtx (vmode); + op3 = gen_reg_rtx (vmode); + emit_move_insn (op2, gen_rtx_AND (vmode, + gen_rtx_NOT (vmode, mask), + op0)); + emit_move_insn (op3, gen_rtx_AND (vmode, mask, op1)); + emit_move_insn (dest, gen_rtx_IOR (vmode, op2, op3)); } /* Expand an xorsign operation. */ @@ -2018,7 +2174,7 @@ void ix86_expand_xorsign (rtx operands[]) { machine_mode mode, vmode; - rtx dest, op0, op1, mask; + rtx dest, op0, op1, mask, x, temp; dest = operands[0]; op0 = operands[1]; @@ -2033,32 +2189,17 @@ ix86_expand_xorsign (rtx operands[]) else gcc_unreachable (); + temp = gen_reg_rtx (vmode); mask = ix86_build_signbit_mask (vmode, 0, 0); - emit_insn (gen_xorsign3_1 (mode, dest, op0, op1, mask)); -} - -/* Deconstruct an xorsign operation into bit masks. 
*/ - -void -ix86_split_xorsign (rtx operands[]) -{ - machine_mode mode, vmode; - rtx dest, op0, mask, x; - - dest = operands[0]; - op0 = operands[1]; - mask = operands[3]; + op1 = lowpart_subreg (vmode, op1, mode); + x = gen_rtx_AND (vmode, op1, mask); + emit_insn (gen_rtx_SET (temp, x)); - mode = GET_MODE (dest); - vmode = GET_MODE (mask); + op0 = lowpart_subreg (vmode, op0, mode); + x = gen_rtx_XOR (vmode, temp, op0); dest = lowpart_subreg (vmode, dest, mode); - x = gen_rtx_AND (vmode, dest, mask); - emit_insn (gen_rtx_SET (dest, x)); - - op0 = lowpart_subreg (vmode, op0, mode); - x = gen_rtx_XOR (vmode, dest, op0); emit_insn (gen_rtx_SET (dest, x)); } @@ -2096,6 +2237,7 @@ ix86_expand_branch (enum rtx_code code, rtx op0, rtx op1, rtx label) switch (mode) { + case E_HFmode: case E_SFmode: case E_DFmode: case E_XFmode: @@ -2409,7 +2551,7 @@ ix86_prepare_fp_compare_args (enum rtx_code code, rtx *pop0, rtx *pop1) bool unordered_compare = ix86_unordered_fp_compare (code); rtx op0 = *pop0, op1 = *pop1; machine_mode op_mode = GET_MODE (op0); - bool is_sse = TARGET_SSE_MATH && SSE_FLOAT_MODE_P (op_mode); + bool is_sse = SSE_FLOAT_MODE_SSEMATH_OR_HF_P (op_mode); /* All of the unordered compare instructions only work on registers. The same is true of the fcomi compare instructions. The XFmode @@ -2650,6 +2792,14 @@ ix86_expand_int_compare (enum rtx_code code, rtx op0, rtx op1) machine_mode cmpmode; rtx tmp, flags; + /* Swap operands to emit carry flag comparison. */ + if ((code == GTU || code == LEU) + && nonimmediate_operand (op1, VOIDmode)) + { + std::swap (op0, op1); + code = swap_condition (code); + } + cmpmode = SELECT_CC_MODE (code, op0, op1); flags = gen_rtx_REG (cmpmode, FLAGS_REG); @@ -3469,6 +3619,37 @@ ix86_valid_mask_cmp_mode (machine_mode mode) return vector_size == 64 || TARGET_AVX512VL; } +/* Return true if integer mask comparison should be used. */ +static bool +ix86_use_mask_cmp_p (machine_mode mode, machine_mode cmp_mode, + rtx op_true, rtx op_false) +{ + int vector_size = GET_MODE_SIZE (mode); + + if (vector_size < 16) + return false; + else if (vector_size == 64) + return true; + + /* When op_true is NULL, op_false must be NULL, or vice versa. */ + gcc_assert (!op_true == !op_false); + + /* When op_true/op_false is NULL or cmp_mode is not valid mask cmp mode, + vector dest is required. */ + if (!op_true || !ix86_valid_mask_cmp_mode (cmp_mode)) + return false; + + /* Exclude those that could be optimized in ix86_expand_sse_movcc. */ + if (op_false == CONST0_RTX (mode) + || op_true == CONST0_RTX (mode) + || (INTEGRAL_MODE_P (mode) + && (op_true == CONSTM1_RTX (mode) + || op_false == CONSTM1_RTX (mode)))) + return false; + + return true; +} + /* Expand an SSE comparison. Return the register with the result. */ static rtx @@ -3485,7 +3666,7 @@ ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1, bool maskcmp = false; rtx x; - if (ix86_valid_mask_cmp_mode (cmp_ops_mode)) + if (ix86_use_mask_cmp_p (mode, cmp_ops_mode, op_true, op_false)) { unsigned int nbits = GET_MODE_NUNITS (cmp_ops_mode); maskcmp = true; @@ -3496,7 +3677,7 @@ ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1, cmp_op0 = force_reg (cmp_ops_mode, cmp_op0); - int (*op1_predicate)(rtx, machine_mode) + bool (*op1_predicate)(rtx, machine_mode) = VECTOR_MODE_P (cmp_ops_mode) ? 
vector_operand : nonimmediate_operand; if (!op1_predicate (cmp_op1, cmp_ops_mode)) @@ -3517,7 +3698,7 @@ ix86_expand_sse_cmp (rtx dest, enum rtx_code code, rtx cmp_op0, rtx cmp_op1, x = gen_rtx_fmt_ee (code, cmp_mode, cmp_op0, cmp_op1); - if (cmp_mode != mode && !maskcmp) + if (cmp_mode != mode) { x = force_reg (cmp_ops_mode, x); convert_move (dest, x, false); @@ -3544,9 +3725,6 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) return; } - /* In AVX512F the result of comparison is an integer mask. */ - bool maskcmp = mode != cmpmode && ix86_valid_mask_cmp_mode (mode); - rtx t2, t3, x; /* If we have an integer mask and FP value then we need @@ -3557,8 +3735,11 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) cmp = gen_rtx_SUBREG (mode, cmp, 0); } - if (maskcmp) + /* In AVX512F the result of comparison is an integer mask. */ + if (mode != cmpmode + && GET_MODE_CLASS (cmpmode) == MODE_INT) { + gcc_assert (ix86_valid_mask_cmp_mode (mode)); /* Using vector move with mask register. */ cmp = force_reg (cmpmode, cmp); /* Optimize for mask zero. */ @@ -3568,17 +3749,11 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) ? force_reg (mode, op_false) : op_false); if (op_true == CONST0_RTX (mode)) { - rtx (*gen_not) (rtx, rtx); - switch (cmpmode) - { - case E_QImode: gen_not = gen_knotqi; break; - case E_HImode: gen_not = gen_knothi; break; - case E_SImode: gen_not = gen_knotsi; break; - case E_DImode: gen_not = gen_knotdi; break; - default: gcc_unreachable (); - } rtx n = gen_reg_rtx (cmpmode); - emit_insn (gen_not (n, cmp)); + if (cmpmode == E_DImode && !TARGET_64BIT) + emit_insn (gen_knotdi (n, cmp)); + else + emit_insn (gen_rtx_SET (n, gen_rtx_fmt_e (NOT, cmpmode, cmp))); cmp = n; /* Reverse op_true op_false. 
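	     The mask has just been inverted, so swapping the arms
	     keeps the overall selection unchanged.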
*/ std::swap (op_true, op_false); @@ -3620,7 +3795,8 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) { op_true = force_reg (mode, op_true); - if (!nonimmediate_operand (op_false, mode)) + if (GET_MODE_SIZE (mode) < 16 + || !nonimmediate_operand (op_false, mode)) op_false = force_reg (mode, op_false); emit_insn (gen_rtx_SET (dest, gen_rtx_IF_THEN_ELSE (mode, cmp, @@ -3639,6 +3815,13 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) switch (mode) { + case E_V2SFmode: + if (TARGET_SSE4_1) + { + gen = gen_mmx_blendvps; + op_true = force_reg (mode, op_true); + } + break; case E_V4SFmode: if (TARGET_SSE4_1) gen = gen_sse4_1_blendvps; @@ -3661,8 +3844,38 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) op_true = force_reg (mode, op_true); } break; + case E_V8QImode: + case E_V4HImode: + case E_V2SImode: + if (TARGET_SSE4_1) + { + op_true = force_reg (mode, op_true); + + gen = gen_mmx_pblendvb64; + if (mode != V8QImode) + d = gen_reg_rtx (V8QImode); + op_false = gen_lowpart (V8QImode, op_false); + op_true = gen_lowpart (V8QImode, op_true); + cmp = gen_lowpart (V8QImode, cmp); + } + break; + case E_V4QImode: + case E_V2HImode: + if (TARGET_SSE4_1) + { + op_true = force_reg (mode, op_true); + + gen = gen_mmx_pblendvb32; + if (mode != V4QImode) + d = gen_reg_rtx (V4QImode); + op_false = gen_lowpart (V4QImode, op_false); + op_true = gen_lowpart (V4QImode, op_true); + cmp = gen_lowpart (V4QImode, cmp); + } + break; case E_V16QImode: case E_V8HImode: + case E_V8HFmode: case E_V4SImode: case E_V2DImode: if (TARGET_SSE4_1) @@ -3685,6 +3898,7 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) break; case E_V32QImode: case E_V16HImode: + case E_V16HFmode: case E_V8SImode: case E_V4DImode: if (TARGET_AVX2) @@ -3704,6 +3918,9 @@ ix86_expand_sse_movcc (rtx dest, rtx cmp, rtx op_true, rtx op_false) case E_V32HImode: gen = gen_avx512bw_blendmv32hi; break; + case E_V32HFmode: + gen = gen_avx512bw_blendmv32hf; + break; case E_V16SImode: gen = gen_avx512f_blendmv16si; break; @@ -3824,7 +4041,7 @@ ix86_expand_fp_movcc (rtx operands[]) rtx op0 = XEXP (operands[1], 0); rtx op1 = XEXP (operands[1], 1); - if (TARGET_SSE_MATH && SSE_FLOAT_MODE_P (mode)) + if (SSE_FLOAT_MODE_SSEMATH_OR_HF_P (mode)) { machine_mode cmode; @@ -4022,7 +4239,7 @@ ix86_expand_fp_vec_cmp (rtx operands[]) } else cmp = ix86_expand_sse_cmp (operands[0], code, operands[2], operands[3], - operands[1], operands[2]); + NULL, NULL); if (operands[0] != cmp) emit_move_insn (operands[0], cmp); @@ -4042,12 +4259,12 @@ ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1, /* XOP supports all of the comparisons on all 128-bit vector int types. */ if (TARGET_XOP - && (mode == V16QImode || mode == V8HImode - || mode == V4SImode || mode == V2DImode)) + && GET_MODE_CLASS (mode) == MODE_VECTOR_INT + && GET_MODE_SIZE (mode) <= 16) ; /* AVX512F supports all of the comparsions on all 128/256/512-bit vector int types. 
*/ - else if (ix86_valid_mask_cmp_mode (mode)) + else if (ix86_use_mask_cmp_p (data_mode, mode, op_true, op_false)) ; else { @@ -4167,16 +4384,44 @@ ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1, else if (code == GT && TARGET_SSE4_1) gen = gen_sminv16qi3; break; + case E_V8QImode: + if (code == GTU && TARGET_SSE2) + gen = gen_uminv8qi3; + else if (code == GT && TARGET_SSE4_1) + gen = gen_sminv8qi3; + break; + case E_V4QImode: + if (code == GTU && TARGET_SSE2) + gen = gen_uminv4qi3; + else if (code == GT && TARGET_SSE4_1) + gen = gen_sminv4qi3; + break; case E_V8HImode: if (code == GTU && TARGET_SSE4_1) gen = gen_uminv8hi3; else if (code == GT && TARGET_SSE2) gen = gen_sminv8hi3; break; + case E_V4HImode: + if (code == GTU && TARGET_SSE4_1) + gen = gen_uminv4hi3; + else if (code == GT && TARGET_SSE2) + gen = gen_sminv4hi3; + break; + case E_V2HImode: + if (code == GTU && TARGET_SSE4_1) + gen = gen_uminv2hi3; + else if (code == GT && TARGET_SSE2) + gen = gen_sminv2hi3; + break; case E_V4SImode: if (TARGET_SSE4_1) gen = (code == GTU) ? gen_uminv4si3 : gen_sminv4si3; break; + case E_V2SImode: + if (TARGET_SSE4_1) + gen = (code == GTU) ? gen_uminv2si3 : gen_sminv2si3; + break; case E_V2DImode: if (TARGET_AVX512VL) { @@ -4217,6 +4462,7 @@ ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1, case E_V8SImode: case E_V4DImode: case E_V4SImode: + case E_V2SImode: case E_V2DImode: { rtx t1, t2, mask; @@ -4241,7 +4487,11 @@ ix86_expand_int_sse_cmp (rtx dest, enum rtx_code code, rtx cop0, rtx cop1, case E_V32QImode: case E_V16HImode: case E_V16QImode: + case E_V8QImode: + case E_V4QImode: case E_V8HImode: + case E_V4HImode: + case E_V2HImode: /* Perform a parallel unsigned saturating subtraction. */ x = gen_reg_rtx (mode); emit_insn (gen_rtx_SET @@ -4431,6 +4681,18 @@ ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1, switch (mode) { + case E_V16QImode: + if (TARGET_AVX512VL && TARGET_AVX512VBMI) + gen = gen_avx512vl_vpermt2varv16qi3; + break; + case E_V32QImode: + if (TARGET_AVX512VL && TARGET_AVX512VBMI) + gen = gen_avx512vl_vpermt2varv32qi3; + break; + case E_V64QImode: + if (TARGET_AVX512VBMI) + gen = gen_avx512bw_vpermt2varv64qi3; + break; case E_V8HImode: if (TARGET_AVX512VL && TARGET_AVX512BW) gen = gen_avx512vl_vpermt2varv8hi3; @@ -4439,10 +4701,6 @@ ix86_expand_vec_perm_vpermt2 (rtx target, rtx mask, rtx op0, rtx op1, if (TARGET_AVX512VL && TARGET_AVX512BW) gen = gen_avx512vl_vpermt2varv16hi3; break; - case E_V64QImode: - if (TARGET_AVX512VBMI) - gen = gen_avx512bw_vpermt2varv64qi3; - break; case E_V32HImode: if (TARGET_AVX512BW) gen = gen_avx512bw_vpermt2varv32hi3; @@ -5022,6 +5280,24 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) else unpack = gen_sse4_1_sign_extendv2siv2di2; break; + case E_V8QImode: + if (unsigned_p) + unpack = gen_sse4_1_zero_extendv4qiv4hi2; + else + unpack = gen_sse4_1_sign_extendv4qiv4hi2; + break; + case E_V4HImode: + if (unsigned_p) + unpack = gen_sse4_1_zero_extendv2hiv2si2; + else + unpack = gen_sse4_1_sign_extendv2hiv2si2; + break; + case E_V4QImode: + if (unsigned_p) + unpack = gen_sse4_1_zero_extendv2qiv2hi2; + else + unpack = gen_sse4_1_sign_extendv2qiv2hi2; + break; default: gcc_unreachable (); } @@ -5033,10 +5309,30 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) } else if (high_p) { - /* Shift higher 8 bytes to lower 8 bytes. 
*/ - tmp = gen_reg_rtx (V1TImode); - emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src), - GEN_INT (64))); + switch (GET_MODE_SIZE (imode)) + { + case 16: + /* Shift higher 8 bytes to lower 8 bytes. */ + tmp = gen_reg_rtx (V1TImode); + emit_insn (gen_sse2_lshrv1ti3 (tmp, gen_lowpart (V1TImode, src), + GEN_INT (64))); + break; + case 8: + /* Shift higher 4 bytes to lower 4 bytes. */ + tmp = gen_reg_rtx (V1DImode); + emit_insn (gen_mmx_lshrv1di3 (tmp, gen_lowpart (V1DImode, src), + GEN_INT (32))); + break; + case 4: + /* Shift higher 2 bytes to lower 2 bytes. */ + tmp = gen_reg_rtx (V1SImode); + emit_insn (gen_mmx_lshrv1si3 (tmp, gen_lowpart (V1SImode, src), + GEN_INT (16))); + break; + default: + gcc_unreachable (); + } + tmp = gen_lowpart (imode, tmp); } else @@ -5068,6 +5364,24 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) else unpack = gen_vec_interleave_lowv4si; break; + case E_V8QImode: + if (high_p) + unpack = gen_mmx_punpckhbw; + else + unpack = gen_mmx_punpcklbw; + break; + case E_V4HImode: + if (high_p) + unpack = gen_mmx_punpckhwd; + else + unpack = gen_mmx_punpcklwd; + break; + case E_V4QImode: + if (high_p) + unpack = gen_mmx_punpckhbw_low; + else + unpack = gen_mmx_punpcklbw_low; + break; default: gcc_unreachable (); } @@ -5084,6 +5398,45 @@ ix86_expand_sse_unpack (rtx dest, rtx src, bool unsigned_p, bool high_p) } } +/* Return true if mem is pool constant which contains a const_vector + perm index, assign the index to PERM. */ +bool +ix86_extract_perm_from_pool_constant (int* perm, rtx mem) +{ + machine_mode mode = GET_MODE (mem); + int nelt = GET_MODE_NUNITS (mode); + + if (!INTEGRAL_MODE_P (mode)) + return false; + + /* Needs to be constant pool. */ + if (!(MEM_P (mem)) + || !SYMBOL_REF_P (XEXP (mem, 0)) + || !CONSTANT_POOL_ADDRESS_P (XEXP (mem, 0))) + return false; + + rtx constant = get_pool_constant (XEXP (mem, 0)); + + if (GET_CODE (constant) != CONST_VECTOR) + return false; + + /* There could be some rtx like + (mem/u/c:V16QI (symbol_ref/u:DI ("*.LC1"))) + but with "*.LC1" refer to V2DI constant vector. */ + if (GET_MODE (constant) != mode) + { + constant = simplify_subreg (mode, constant, GET_MODE (constant), 0); + + if (constant == nullptr || GET_CODE (constant) != CONST_VECTOR) + return false; + } + + for (int i = 0; i != nelt; i++) + perm[i] = UINTVAL (XVECEXP (constant, 0, i)); + + return true; +} + /* Split operands 0 and 1 into half-mode parts. Similar to split_double_mode, but works for floating pointer parameters and nonoffsetable memories. For pushes, it returns just stack offsets; the values will be saved @@ -5954,6 +6307,7 @@ expand_set_or_cpymem_via_rep (rtx destmem, rtx srcmem, /* If possible, it is shorter to use rep movs. TODO: Maybe it is better to move this logic to decide_alg. */ if (mode == QImode && CONST_INT_P (count) && !(INTVAL (count) & 3) + && !TARGET_PREFER_KNOWN_REP_MOVSB_STOSB && (!issetmem || orig_value == const0_rtx)) mode = SImode; @@ -6962,7 +7316,12 @@ decide_alg (HOST_WIDE_INT count, HOST_WIDE_INT expected_size, else if (!any_alg_usable_p) break; } - else if (alg_usable_p (candidate, memset, have_as)) + else if (alg_usable_p (candidate, memset, have_as) + && !(TARGET_PREFER_KNOWN_REP_MOVSB_STOSB + && candidate == rep_prefix_1_byte + /* NB: If min_size != max_size, size is + unknown. 
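+		    In that case do not pick rep movsb/stosb here;
+		    TARGET_PREFER_KNOWN_REP_MOVSB_STOSB only prefers
+		    them for sizes known at compile time.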
*/ + && min_size != max_size)) { *noalign = algs->size[i].noalign; return candidate; @@ -7027,7 +7386,7 @@ decide_alignment (int align, desired_align = GET_MODE_SIZE (move_mode); /* PentiumPro has special logic triggering for 8 byte aligned blocks. copying whole cacheline at once. */ - if (TARGET_PENTIUMPRO + if (TARGET_CPU_P (PENTIUMPRO) && (alg == rep_prefix_4_byte || alg == rep_prefix_1_byte)) desired_align = 8; @@ -8027,6 +8386,7 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1, pop = NULL; gcc_assert (!TARGET_64BIT || !pop); + rtx addr = XEXP (fnaddr, 0); if (TARGET_MACHO && !TARGET_64BIT) { #if TARGET_MACHO @@ -8039,7 +8399,6 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1, /* Static functions and indirect calls don't need the pic register. Also, check if PLT was explicitly avoided via no-plt or "noplt" attribute, making it an indirect call. */ - rtx addr = XEXP (fnaddr, 0); if (flag_pic && GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr)) @@ -8202,6 +8561,20 @@ ix86_expand_call (rtx retval, rtx fnaddr, rtx callarg1, } } + if (TARGET_MACHO && TARGET_64BIT && !sibcall + && ((GET_CODE (addr) == SYMBOL_REF && !SYMBOL_REF_LOCAL_P (addr)) + || !fndecl || TREE_PUBLIC (fndecl))) + { + /* We allow public functions defined in a TU to bind locally for PIC + code (the default) on 64bit Mach-O. + If such functions are not inlined, we cannot tell at compile-time if + they will be called via the lazy symbol resolver (this can depend on + options given at link-time). Therefore, we must assume that the lazy + resolver could be used which clobbers R11 and R10. */ + clobber_reg (&use, gen_rtx_REG (DImode, R11_REG)); + clobber_reg (&use, gen_rtx_REG (DImode, R10_REG)); + } + if (vec_len > 1) call = gen_rtx_PARALLEL (VOIDmode, gen_rtvec_v (vec_len, vec)); rtx_insn *call_insn = emit_call_insn (call); @@ -8634,11 +9007,6 @@ ix86_expand_sse_comi (const struct builtin_description *d, tree exp, if (VECTOR_MODE_P (mode1)) op1 = safe_vector_operand (op1, mode1); - /* Swap operands if we have a comparison that isn't available in - hardware. 
*/ - if (d->flag & BUILTIN_DESC_SWAP_OPERANDS) - std::swap (op0, op1); - target = gen_reg_rtx (SImode); emit_move_insn (target, const0_rtx); target = gen_rtx_SUBREG (QImode, target, 0); @@ -9159,12 +9527,14 @@ ix86_expand_args_builtin (const struct builtin_description *d, case FLOAT128_FTYPE_FLOAT128_FLOAT128: case V16QI_FTYPE_V16QI_V16QI: case V16QI_FTYPE_V8HI_V8HI: + case V16HF_FTYPE_V16HF_V16HF: case V16SF_FTYPE_V16SF_V16SF: case V8QI_FTYPE_V8QI_V8QI: case V8QI_FTYPE_V4HI_V4HI: case V8HI_FTYPE_V8HI_V8HI: case V8HI_FTYPE_V16QI_V16QI: case V8HI_FTYPE_V4SI_V4SI: + case V8HF_FTYPE_V8HF_V8HF: case V8SF_FTYPE_V8SF_V8SF: case V8SF_FTYPE_V8SF_V8SI: case V8DF_FTYPE_V8DF_V8DF: @@ -9574,6 +9944,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V16HI_FTYPE_V8SI_V8SI_V16HI_UHI: case V8HI_FTYPE_V4SI_V4SI_V8HI_UQI: case V4DF_FTYPE_V4DF_V4DI_V4DF_UQI: + case V32HF_FTYPE_V32HF_V32HF_V32HF_USI: case V8SF_FTYPE_V8SF_V8SI_V8SF_UQI: case V4SF_FTYPE_V4SF_V4SI_V4SF_UQI: case V2DF_FTYPE_V2DF_V2DI_V2DF_UQI: @@ -9591,6 +9962,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V8HI_FTYPE_V8HI_V8HI_V8HI_UQI: case V8SI_FTYPE_V8SI_V8SI_V8SI_UQI: case V4SI_FTYPE_V4SI_V4SI_V4SI_UQI: + case V16HF_FTYPE_V16HF_V16HF_V16HF_UHI: case V8SF_FTYPE_V8SF_V8SF_V8SF_UQI: case V16QI_FTYPE_V16QI_V16QI_V16QI_UHI: case V16HI_FTYPE_V16HI_V16HI_V16HI_UHI: @@ -9598,6 +9970,7 @@ ix86_expand_args_builtin (const struct builtin_description *d, case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI: case V4DI_FTYPE_V4DI_V4DI_V4DI_UQI: case V4DF_FTYPE_V4DF_V4DF_V4DF_UQI: + case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI: case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI: case V8DF_FTYPE_V8DF_V8DF_V8DF_UQI: case V8DF_FTYPE_V8DF_V8DI_V8DF_UQI: @@ -9626,14 +9999,17 @@ ix86_expand_args_builtin (const struct builtin_description *d, case UQI_FTYPE_V8SI_V8SI_INT_UQI: case QI_FTYPE_V4DF_V4DF_INT_UQI: case QI_FTYPE_V8SF_V8SF_INT_UQI: + case UHI_FTYPE_V16HF_V16HF_INT_UHI: case UQI_FTYPE_V2DI_V2DI_INT_UQI: case UQI_FTYPE_V4SI_V4SI_INT_UQI: case UQI_FTYPE_V2DF_V2DF_INT_UQI: case UQI_FTYPE_V4SF_V4SF_INT_UQI: + case UQI_FTYPE_V8HF_V8HF_INT_UQI: case UDI_FTYPE_V64QI_V64QI_INT_UDI: case USI_FTYPE_V32QI_V32QI_INT_USI: case UHI_FTYPE_V16QI_V16QI_INT_UHI: case USI_FTYPE_V32HI_V32HI_INT_USI: + case USI_FTYPE_V32HF_V32HF_INT_USI: case UHI_FTYPE_V16HI_V16HI_INT_UHI: case UQI_FTYPE_V8HI_V8HI_INT_UQI: nargs = 4; @@ -9917,6 +10293,9 @@ ix86_expand_args_builtin (const struct builtin_description *d, case CODE_FOR_avx512f_cmpv16sf3_mask: case CODE_FOR_avx512f_vmcmpv2df3_mask: case CODE_FOR_avx512f_vmcmpv4sf3_mask: + case CODE_FOR_avx512bw_cmpv32hf3_mask: + case CODE_FOR_avx512vl_cmpv16hf3_mask: + case CODE_FOR_avx512fp16_cmpv8hf3_mask: error ("the last argument must be a 5-bit immediate"); return const0_rtx; @@ -10274,6 +10653,8 @@ ix86_expand_round_builtin (const struct builtin_description *d, case INT_FTYPE_V4SF_INT: nargs = 2; break; + case V32HF_FTYPE_V32HF_V32HF_INT: + case V8HF_FTYPE_V8HF_V8HF_INT: case V4SF_FTYPE_V4SF_UINT_INT: case V4SF_FTYPE_V4SF_UINT64_INT: case V2DF_FTYPE_V2DF_UINT64_INT: @@ -10314,12 +10695,14 @@ ix86_expand_round_builtin (const struct builtin_description *d, case V2DF_FTYPE_V2DF_V2DF_V2DF_UQI_INT: case V4SF_FTYPE_V4SF_V4SF_V4SF_UQI_INT: case V16SF_FTYPE_V16SF_V16SF_V16SF_HI_INT: + case V32HF_FTYPE_V32HF_V32HF_V32HF_USI_INT: case V2DF_FTYPE_V2DF_V2DF_V2DF_QI_INT: case V2DF_FTYPE_V2DF_V4SF_V2DF_QI_INT: case V2DF_FTYPE_V2DF_V4SF_V2DF_UQI_INT: case V4SF_FTYPE_V4SF_V4SF_V4SF_QI_INT: case V4SF_FTYPE_V4SF_V2DF_V4SF_QI_INT: case 
V4SF_FTYPE_V4SF_V2DF_V4SF_UQI_INT: + case V8HF_FTYPE_V8HF_V8HF_V8HF_UQI_INT: nargs = 5; break; case V16SF_FTYPE_V16SF_INT_V16SF_HI_INT: @@ -10333,6 +10716,8 @@ ix86_expand_round_builtin (const struct builtin_description *d, case UQI_FTYPE_V2DF_V2DF_INT_UQI_INT: case UHI_FTYPE_V16SF_V16SF_INT_UHI_INT: case UQI_FTYPE_V4SF_V4SF_INT_UQI_INT: + case USI_FTYPE_V32HF_V32HF_INT_USI_INT: + case UQI_FTYPE_V8HF_V8HF_INT_UQI_INT: nargs_constant = 3; nargs = 5; break; @@ -10388,6 +10773,8 @@ ix86_expand_round_builtin (const struct builtin_description *d, case CODE_FOR_avx512f_cmpv16sf3_mask_round: case CODE_FOR_avx512f_vmcmpv2df3_mask_round: case CODE_FOR_avx512f_vmcmpv4sf3_mask_round: + case CODE_FOR_avx512f_vmcmpv8hf3_mask_round: + case CODE_FOR_avx512bw_cmpv32hf3_mask_round: error ("the immediate argument must be a 5-bit immediate"); return const0_rtx; default: @@ -10832,11 +11219,12 @@ ix86_expand_special_args_builtin (const struct builtin_description *d, op = fixup_modeless_constant (op, mode); - /* NB: 3-operands load implied it's a mask load, + /* NB: 3-operands load implied it's a mask load or v{p}expand*, and that mask operand shoud be at the end. Keep all-ones mask which would be simplified by the expander. */ if (nargs == 3 && i == 2 && klass == load - && constm1_operand (op, mode)) + && constm1_operand (op, mode) + && insn_p->operand[i].predicate (op, mode)) ; else if (GET_MODE (op) == mode || GET_MODE (op) == VOIDmode) op = copy_to_mode_reg (mode, op); @@ -11407,10 +11795,24 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget, if (target == 0) target = gen_reg_rtx (QImode); - pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCZmode, FLAGS_REG), - const0_rtx); - emit_insn (gen_rtx_SET (target, pat)); + /* NB: For aesenc/aesdec keylocker insn, ZF will be set when runtime + error occurs. Then the output should be cleared for safety. */ + rtx_code_label *ok_label; + rtx tmp; + + tmp = gen_rtx_REG (CCZmode, FLAGS_REG); + pat = gen_rtx_EQ (QImode, tmp, const0_rtx); + ok_label = gen_label_rtx (); + emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp), + true, ok_label); + /* Usually the runtime error seldom occur, so predict OK path as + hotspot to optimize it as fallthrough block. 
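+     (REG_BR_PROB_BASE * 90 / 100 marks the branch to ok_label as
+     ~90% taken, so the error-clearing stores stay out of the hot
+     path.)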
*/ + predict_jump (REG_BR_PROB_BASE * 90 / 100); + + emit_insn (gen_rtx_SET (op1, const0_rtx)); + emit_label (ok_label); + emit_insn (gen_rtx_SET (target, pat)); emit_insn (gen_rtx_SET (op0, op1)); return target; @@ -11465,8 +11867,17 @@ ix86_expand_builtin (tree exp, rtx target, rtx subtarget, if (target == 0) target = gen_reg_rtx (QImode); - pat = gen_rtx_EQ (QImode, gen_rtx_REG (CCZmode, FLAGS_REG), - const0_rtx); + tmp = gen_rtx_REG (CCZmode, FLAGS_REG); + pat = gen_rtx_EQ (QImode, tmp, const0_rtx); + ok_label = gen_label_rtx (); + emit_cmp_and_jump_insns (tmp, const0_rtx, NE, 0, GET_MODE (tmp), + true, ok_label); + predict_jump (REG_BR_PROB_BASE * 90 / 100); + + for (i = 0; i < 8; i++) + emit_insn (gen_rtx_SET (xmm_regs[i], const0_rtx)); + + emit_label (ok_label); emit_insn (gen_rtx_SET (target, pat)); for (i = 0; i < 8; i++) @@ -13206,6 +13617,14 @@ rdseed_step: target); } + if (fcode >= IX86_BUILTIN__BDESC_PURE_ARGS_FIRST + && fcode <= IX86_BUILTIN__BDESC_PURE_ARGS_LAST) + { + i = fcode - IX86_BUILTIN__BDESC_PURE_ARGS_FIRST; + return ix86_expand_special_args_builtin (bdesc_pure_args + i, exp, + target); + } + if (fcode >= IX86_BUILTIN__BDESC_ARGS_FIRST && fcode <= IX86_BUILTIN__BDESC_ARGS_LAST) { @@ -13552,7 +13971,7 @@ static bool expand_vec_perm_1 (struct expand_vec_perm_d *d); /* A subroutine of ix86_expand_vector_init. Store into TARGET a vector with all elements equal to VAR. Return true if successful. */ -static bool +bool ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, rtx target, rtx val) { @@ -13595,7 +14014,21 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, } goto widen; + case E_V2HImode: + if (TARGET_SSE2) + { + rtx x; + + val = gen_lowpart (SImode, val); + x = gen_rtx_TRUNCATE (HImode, val); + x = gen_rtx_VEC_DUPLICATE (mode, x); + emit_insn (gen_rtx_SET (target, x)); + return true; + } + return false; + case E_V8QImode: + case E_V4QImode: if (!mmx_ok) return false; goto widen; @@ -13652,10 +14085,17 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, wsmode = GET_MODE_INNER (wvmode); val = convert_modes (wsmode, smode, val, true); - x = expand_simple_binop (wsmode, ASHIFT, val, - GEN_INT (GET_MODE_BITSIZE (smode)), - NULL_RTX, 1, OPTAB_LIB_WIDEN); - val = expand_simple_binop (wsmode, IOR, val, x, x, 1, OPTAB_LIB_WIDEN); + + if (smode == QImode && !TARGET_PARTIAL_REG_STALL) + emit_insn (gen_insv_1 (wsmode, val, val)); + else + { + x = expand_simple_binop (wsmode, ASHIFT, val, + GEN_INT (GET_MODE_BITSIZE (smode)), + NULL_RTX, 1, OPTAB_LIB_WIDEN); + val = expand_simple_binop (wsmode, IOR, val, x, x, 1, + OPTAB_LIB_WIDEN); + } x = gen_reg_rtx (wvmode); ok = ix86_expand_vector_init_duplicate (mmx_ok, wvmode, x, val); @@ -13698,6 +14138,11 @@ ix86_expand_vector_init_duplicate (bool mmx_ok, machine_mode mode, } return true; + case E_V8HFmode: + case E_V16HFmode: + case E_V32HFmode: + return ix86_vector_duplicate_value (mode, target, val); + default: return false; } @@ -13741,6 +14186,9 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, case E_V4HImode: use_vector_set = TARGET_SSE || TARGET_3DNOW_A; break; + case E_V4QImode: + use_vector_set = TARGET_SSE4_1; + break; case E_V32QImode: case E_V16HImode: use_vector_set = TARGET_AVX; @@ -13779,6 +14227,18 @@ ix86_expand_vector_init_one_nonzero (bool mmx_ok, machine_mode mode, use_vector_set = TARGET_AVX512F && TARGET_64BIT && one_var == 0; gen_vec_set_0 = gen_vec_setv8di_0; break; + case E_V8HFmode: + use_vector_set = TARGET_AVX512FP16 && one_var == 
0; + gen_vec_set_0 = gen_vec_setv8hf_0; + break; + case E_V16HFmode: + use_vector_set = TARGET_AVX512FP16 && one_var == 0; + gen_vec_set_0 = gen_vec_setv16hf_0; + break; + case E_V32HFmode: + use_vector_set = TARGET_AVX512FP16 && one_var == 0; + gen_vec_set_0 = gen_vec_setv32hf_0; + break; default: break; } @@ -13928,6 +14388,8 @@ ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode, if (!TARGET_64BIT) return false; /* FALLTHRU */ + case E_V8HFmode: + case E_V16HFmode: case E_V4DFmode: case E_V8SFmode: case E_V8SImode: @@ -13949,6 +14411,10 @@ ix86_expand_vector_init_one_var (bool mmx_ok, machine_mode mode, break; wmode = V4HImode; goto widen; + case E_V4QImode: + if (TARGET_SSE4_1) + break; + wmode = V2HImode; widen: /* There's no way to set one QImode entry easily. Combine the variable value with its adjacent constant value, and @@ -14004,6 +14470,9 @@ ix86_expand_vector_init_concat (machine_mode mode, case 2: switch (mode) { + case E_V32HFmode: + half_mode = V16HFmode; + break; case E_V16SImode: half_mode = V8SImode; break; @@ -14016,6 +14485,9 @@ ix86_expand_vector_init_concat (machine_mode mode, case E_V8DFmode: half_mode = V4DFmode; break; + case E_V16HFmode: + half_mode = V8HFmode; + break; case E_V8SImode: half_mode = V4SImode; break; @@ -14158,13 +14630,22 @@ ix86_expand_vector_init_interleave (machine_mode mode, { machine_mode first_imode, second_imode, third_imode, inner_mode; int i, j; - rtx op0, op1; + rtx op, op0, op1; rtx (*gen_load_even) (rtx, rtx, rtx); rtx (*gen_interleave_first_low) (rtx, rtx, rtx); rtx (*gen_interleave_second_low) (rtx, rtx, rtx); switch (mode) { + case E_V8HFmode: + gen_load_even = gen_vec_setv8hf; + gen_interleave_first_low = gen_vec_interleave_lowv4si; + gen_interleave_second_low = gen_vec_interleave_lowv2di; + inner_mode = HFmode; + first_imode = V4SImode; + second_imode = V2DImode; + third_imode = VOIDmode; + break; case E_V8HImode: gen_load_even = gen_vec_setv8hi; gen_interleave_first_low = gen_vec_interleave_lowv4si; @@ -14189,9 +14670,19 @@ ix86_expand_vector_init_interleave (machine_mode mode, for (i = 0; i < n; i++) { + op = ops [i + i]; + if (inner_mode == HFmode) + { + /* Convert HFmode to HImode. */ + op1 = gen_reg_rtx (HImode); + op1 = gen_rtx_SUBREG (HImode, force_reg (HFmode, op), 0); + op = gen_reg_rtx (HImode); + emit_move_insn (op, op1); + } + /* Extend the odd elment to SImode using a paradoxical SUBREG. */ op0 = gen_reg_rtx (SImode); - emit_move_insn (op0, gen_lowpart (SImode, ops [i + i])); + emit_move_insn (op0, gen_lowpart (SImode, op)); /* Insert the SImode value as low element of V4SImode vector. */ op1 = gen_reg_rtx (V4SImode); @@ -14328,6 +14819,10 @@ ix86_expand_vector_init_general (bool mmx_ok, machine_mode mode, half_mode = V8HImode; goto half; + case E_V16HFmode: + half_mode = V8HFmode; + goto half; + half: n = GET_MODE_NUNITS (mode); for (i = 0; i < n; i++) @@ -14351,6 +14846,11 @@ half: half_mode = V16HImode; goto quarter; + case E_V32HFmode: + quarter_mode = V8HFmode; + half_mode = V16HFmode; + goto quarter; + quarter: n = GET_MODE_NUNITS (mode); for (i = 0; i < n; i++) @@ -14387,6 +14887,9 @@ quarter: move from GPR to SSE register directly. 
*/ if (!TARGET_INTER_UNIT_MOVES_TO_VEC) break; + /* FALLTHRU */ + + case E_V8HFmode: n = GET_MODE_NUNITS (mode); for (i = 0; i < n; i++) @@ -14396,6 +14899,9 @@ quarter: case E_V4HImode: case E_V8QImode: + + case E_V2HImode: + case E_V4QImode: break; default: @@ -14404,12 +14910,14 @@ quarter: { int i, j, n_elts, n_words, n_elt_per_word; - machine_mode inner_mode; + machine_mode tmp_mode, inner_mode; rtx words[4], shift; + tmp_mode = (GET_MODE_SIZE (mode) < UNITS_PER_WORD) ? SImode : word_mode; + inner_mode = GET_MODE_INNER (mode); n_elts = GET_MODE_NUNITS (mode); - n_words = GET_MODE_SIZE (mode) / UNITS_PER_WORD; + n_words = GET_MODE_SIZE (mode) / GET_MODE_SIZE (tmp_mode); n_elt_per_word = n_elts / n_words; shift = GEN_INT (GET_MODE_BITSIZE (inner_mode)); @@ -14420,15 +14928,15 @@ quarter: for (j = 0; j < n_elt_per_word; ++j) { rtx elt = XVECEXP (vals, 0, (i+1)*n_elt_per_word - j - 1); - elt = convert_modes (word_mode, inner_mode, elt, true); + elt = convert_modes (tmp_mode, inner_mode, elt, true); if (j == 0) word = elt; else { - word = expand_simple_binop (word_mode, ASHIFT, word, shift, + word = expand_simple_binop (tmp_mode, ASHIFT, word, shift, word, 1, OPTAB_LIB_WIDEN); - word = expand_simple_binop (word_mode, IOR, word, elt, + word = expand_simple_binop (tmp_mode, IOR, word, elt, word, 1, OPTAB_LIB_WIDEN); } } @@ -14442,14 +14950,14 @@ quarter: { rtx tmp = gen_reg_rtx (mode); emit_clobber (tmp); - emit_move_insn (gen_lowpart (word_mode, tmp), words[0]); - emit_move_insn (gen_highpart (word_mode, tmp), words[1]); + emit_move_insn (gen_lowpart (tmp_mode, tmp), words[0]); + emit_move_insn (gen_highpart (tmp_mode, tmp), words[1]); emit_move_insn (target, tmp); } else if (n_words == 4) { rtx tmp = gen_reg_rtx (V4SImode); - gcc_assert (word_mode == SImode); + gcc_assert (tmp_mode == SImode); vals = gen_rtx_PARALLEL (V4SImode, gen_rtvec_v (4, words)); ix86_expand_vector_init_general (false, V4SImode, tmp, vals); emit_move_insn (target, gen_lowpart (mode, tmp)); @@ -14482,11 +14990,15 @@ ix86_expand_vector_init (bool mmx_ok, rtx target, rtx vals) if (GET_MODE_NUNITS (GET_MODE (x)) * 2 == n_elts) { rtx ops[2] = { XVECEXP (vals, 0, 0), XVECEXP (vals, 0, 1) }; - if (inner_mode == QImode || inner_mode == HImode) + if (inner_mode == QImode + || inner_mode == HImode + || inner_mode == TImode) { unsigned int n_bits = n_elts * GET_MODE_SIZE (inner_mode); - mode = mode_for_vector (SImode, n_bits / 4).require (); - inner_mode = mode_for_vector (SImode, n_bits / 8).require (); + scalar_mode elt_mode = inner_mode == TImode ? DImode : SImode; + n_bits /= GET_MODE_SIZE (elt_mode); + mode = mode_for_vector (elt_mode, n_bits).require (); + inner_mode = mode_for_vector (elt_mode, n_bits / 2).require (); ops[0] = gen_lowpart (inner_mode, ops[0]); ops[1] = gen_lowpart (inner_mode, ops[1]); subtarget = gen_reg_rtx (mode); @@ -14613,6 +15125,9 @@ ix86_expand_vector_set_var (rtx target, rtx val, rtx idx) case E_V8DFmode: cmp_mode = V8DImode; break; + case E_V2SFmode: + cmp_mode = V2SImode; + break; case E_V4SFmode: cmp_mode = V4SImode; break; @@ -14622,6 +15137,16 @@ ix86_expand_vector_set_var (rtx target, rtx val, rtx idx) case E_V16SFmode: cmp_mode = V16SImode; break; + /* TARGET_AVX512FP16 implies TARGET_AVX512BW. 
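+	   Thus the V*HImode comparisons used for these cases are
+	   always available here.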
*/ + case E_V8HFmode: + cmp_mode = V8HImode; + break; + case E_V16HFmode: + cmp_mode = V16HImode; + break; + case E_V32HFmode: + cmp_mode = V32HImode; + break; default: gcc_unreachable (); } @@ -14634,9 +15159,11 @@ ix86_expand_vector_set_var (rtx target, rtx val, rtx idx) idxv = gen_reg_rtx (cmp_mode); idx_tmp = convert_to_mode (GET_MODE_INNER (cmp_mode), idx, 1); - ok = ix86_expand_vector_init_duplicate (false, mode, valv, val); + ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE, + mode, valv, val); gcc_assert (ok); - ok = ix86_expand_vector_init_duplicate (false, cmp_mode, idxv, idx_tmp); + ok = ix86_expand_vector_init_duplicate (TARGET_MMX_WITH_SSE, + cmp_mode, idxv, idx_tmp); gcc_assert (ok); vec[0] = target; vec[1] = valv; @@ -14656,23 +15183,25 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) machine_mode half_mode; bool use_vec_merge = false; rtx tmp; - static rtx (*gen_extract[6][2]) (rtx, rtx) + static rtx (*gen_extract[7][2]) (rtx, rtx) = { { gen_vec_extract_lo_v32qi, gen_vec_extract_hi_v32qi }, { gen_vec_extract_lo_v16hi, gen_vec_extract_hi_v16hi }, { gen_vec_extract_lo_v8si, gen_vec_extract_hi_v8si }, { gen_vec_extract_lo_v4di, gen_vec_extract_hi_v4di }, { gen_vec_extract_lo_v8sf, gen_vec_extract_hi_v8sf }, - { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df } + { gen_vec_extract_lo_v4df, gen_vec_extract_hi_v4df }, + { gen_vec_extract_lo_v16hf, gen_vec_extract_hi_v16hf } }; - static rtx (*gen_insert[6][2]) (rtx, rtx, rtx) + static rtx (*gen_insert[7][2]) (rtx, rtx, rtx) = { { gen_vec_set_lo_v32qi, gen_vec_set_hi_v32qi }, { gen_vec_set_lo_v16hi, gen_vec_set_hi_v16hi }, { gen_vec_set_lo_v8si, gen_vec_set_hi_v8si }, { gen_vec_set_lo_v4di, gen_vec_set_hi_v4di }, { gen_vec_set_lo_v8sf, gen_vec_set_hi_v8sf }, - { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df } + { gen_vec_set_lo_v4df, gen_vec_set_hi_v4df }, + { gen_vec_set_lo_v16hf, gen_vec_set_hi_v16hf }, }; int i, j, n; machine_mode mmode = VOIDmode; @@ -14839,7 +15368,12 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) } return; + case E_V8HFmode: + use_vec_merge = true; + break; + case E_V8HImode: + case E_V2HImode: use_vec_merge = TARGET_SSE2; break; case E_V4HImode: @@ -14847,6 +15381,7 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) break; case E_V16QImode: + case E_V4QImode: use_vec_merge = TARGET_SSE4_1; break; @@ -14860,6 +15395,12 @@ ix86_expand_vector_set (bool mmx_ok, rtx target, rtx val, int elt) n = 16; goto half; + case E_V16HFmode: + half_mode = V8HFmode; + j = 6; + n = 8; + goto half; + case E_V16HImode: half_mode = V8HImode; j = 1; @@ -14940,6 +15481,13 @@ half: } break; + case E_V32HFmode: + if (TARGET_AVX512BW) + { + mmode = SImode; + gen_blendm = gen_avx512bw_blendmv32hf; + } + break; case E_V32HImode: if (TARGET_AVX512BW) { @@ -15146,6 +15694,7 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt) break; case E_V8HImode: + case E_V2HImode: use_vec_extr = TARGET_SSE2; break; case E_V4HImode: @@ -15166,6 +15715,9 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt) return; } break; + case E_V4QImode: + use_vec_extr = TARGET_SSE4_1; + break; case E_V8SFmode: if (TARGET_AVX) @@ -15307,6 +15859,28 @@ ix86_expand_vector_extract (bool mmx_ok, rtx target, rtx vec, int elt) ix86_expand_vector_extract (false, target, tmp, elt & 3); return; + case E_V32HFmode: + tmp = gen_reg_rtx (V16HFmode); + if (elt < 16) + emit_insn (gen_vec_extract_lo_v32hf (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v32hf (tmp, 
vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 15); + return; + + case E_V16HFmode: + tmp = gen_reg_rtx (V8HFmode); + if (elt < 8) + emit_insn (gen_vec_extract_lo_v16hf (tmp, vec)); + else + emit_insn (gen_vec_extract_hi_v16hf (tmp, vec)); + ix86_expand_vector_extract (false, target, tmp, elt & 7); + return; + + case E_V8HFmode: + use_vec_extr = true; + break; + case E_V8QImode: use_vec_extr = TARGET_MMX_WITH_SSE && TARGET_SSE4_1; /* ??? Could extract the appropriate HImode element and shift. */ @@ -16902,7 +17476,9 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) ; else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) ; - else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16) + else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16 + || GET_MODE_SIZE (vmode) == 8 + || GET_MODE_SIZE (vmode) == 4)) ; else return false; @@ -16935,6 +17511,7 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) case E_V8SFmode: case E_V2DFmode: case E_V4SFmode: + case E_V4HImode: case E_V8HImode: case E_V8SImode: case E_V32HImode: @@ -16951,6 +17528,12 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) vmode = V8HImode; goto do_subreg; + case E_V2SImode: + for (i = 0; i < 2; ++i) + mask |= (d->perm[i] >= 2 ? 3 : 0) << (i * 2); + vmode = V4HImode; + goto do_subreg; + case E_V4SImode: for (i = 0; i < 4; ++i) mask |= (d->perm[i] >= 4 ? 3 : 0) << (i * 2); @@ -16972,7 +17555,11 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) vperm = gen_rtx_CONST_VECTOR (vmode, gen_rtvec_v (nelt, rperm)); vperm = force_reg (vmode, vperm); - if (GET_MODE_SIZE (vmode) == 16) + if (GET_MODE_SIZE (vmode) == 4) + emit_insn (gen_mmx_pblendvb32 (target, op0, op1, vperm)); + else if (GET_MODE_SIZE (vmode) == 8) + emit_insn (gen_mmx_pblendvb64 (target, op0, op1, vperm)); + else if (GET_MODE_SIZE (vmode) == 16) emit_insn (gen_sse4_1_pblendvb (target, op0, op1, vperm)); else emit_insn (gen_avx2_pblendvb (target, op0, op1, vperm)); @@ -16992,6 +17579,26 @@ expand_vec_perm_blend (struct expand_vec_perm_d *d) op1 = gen_lowpart (vmode, op1); break; + case E_V8QImode: + for (i = 0; i < 8; i += 2) + if (d->perm[i] + 1 != d->perm[i + 1]) + goto use_pblendvb; + + for (i = 0; i < 4; ++i) + mask |= (d->perm[i * 2] >= 8) << i; + vmode = V4HImode; + goto do_subreg; + + case E_V4QImode: + for (i = 0; i < 4; i += 2) + if (d->perm[i] + 1 != d->perm[i + 1]) + goto use_pblendvb; + + for (i = 0; i < 2; ++i) + mask |= (d->perm[i * 2] >= 4) << i; + vmode = V2HImode; + goto do_subreg; + case E_V32QImode: /* See if bytes move in pairs. If not, vpblendvb must be used. */ for (i = 0; i < 32; i += 2) @@ -17153,6 +17760,59 @@ expand_vec_perm_vpermil (struct expand_vec_perm_d *d) return true; } +/* For V*[QHS]Imode permutations, check if the same permutation + can't be performed in a 2x, 4x or 8x wider inner mode. 
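+   For example, the V8HImode permutation { 0, 1, 4, 5, 2, 3, 6, 7 }
+   moves elements in even/odd pairs, so it can instead be done as the
+   V4SImode permutation { 0, 2, 1, 3 }.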
*/ + +static bool +canonicalize_vector_int_perm (const struct expand_vec_perm_d *d, + struct expand_vec_perm_d *nd) +{ + int i; + machine_mode mode = VOIDmode; + + switch (d->vmode) + { + case E_V8QImode: mode = V4HImode; break; + case E_V16QImode: mode = V8HImode; break; + case E_V32QImode: mode = V16HImode; break; + case E_V64QImode: mode = V32HImode; break; + case E_V4HImode: mode = V2SImode; break; + case E_V8HImode: mode = V4SImode; break; + case E_V16HImode: mode = V8SImode; break; + case E_V32HImode: mode = V16SImode; break; + case E_V4SImode: mode = V2DImode; break; + case E_V8SImode: mode = V4DImode; break; + case E_V16SImode: mode = V8DImode; break; + default: return false; + } + for (i = 0; i < d->nelt; i += 2) + if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1) + return false; + nd->vmode = mode; + nd->nelt = d->nelt / 2; + for (i = 0; i < nd->nelt; i++) + nd->perm[i] = d->perm[2 * i] / 2; + if (GET_MODE_INNER (mode) != DImode) + canonicalize_vector_int_perm (nd, nd); + if (nd != d) + { + nd->one_operand_p = d->one_operand_p; + nd->testing_p = d->testing_p; + if (d->op0 == d->op1) + nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0); + else + { + nd->op0 = gen_lowpart (nd->vmode, d->op0); + nd->op1 = gen_lowpart (nd->vmode, d->op1); + } + if (d->testing_p) + nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1); + else + nd->target = gen_reg_rtx (nd->vmode); + } + return true; +} + /* Return true if permutation D can be performed as VMODE permutation instead. */ @@ -17190,151 +17850,193 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) unsigned i, nelt, eltsz, mask; unsigned char perm[64]; machine_mode vmode = V16QImode; + struct expand_vec_perm_d nd; rtx rperm[64], vperm, target, op0, op1; nelt = d->nelt; if (!d->one_operand_p) - { - if (!TARGET_XOP || GET_MODE_SIZE (d->vmode) != 16) - { - if (TARGET_AVX2 - && valid_perm_using_mode_p (V2TImode, d)) - { - if (d->testing_p) - return true; - - /* Use vperm2i128 insn. The pattern uses - V4DImode instead of V2TImode. */ - target = d->target; - if (d->vmode != V4DImode) - target = gen_reg_rtx (V4DImode); - op0 = gen_lowpart (V4DImode, d->op0); - op1 = gen_lowpart (V4DImode, d->op1); - rperm[0] - = GEN_INT ((d->perm[0] / (nelt / 2)) - | ((d->perm[nelt / 2] / (nelt / 2)) * 16)); - emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0])); - if (target != d->target) - emit_move_insn (d->target, gen_lowpart (d->vmode, target)); - return true; - } + switch (GET_MODE_SIZE (d->vmode)) + { + case 4: + if (!TARGET_XOP) return false; - } - } - else - { - if (GET_MODE_SIZE (d->vmode) == 16) - { - if (!TARGET_SSSE3) - return false; - } - else if (GET_MODE_SIZE (d->vmode) == 32) - { - if (!TARGET_AVX2) - return false; - - /* V4DImode should be already handled through - expand_vselect by vpermq instruction. */ - gcc_assert (d->vmode != V4DImode); + vmode = V4QImode; + break; - vmode = V32QImode; - if (d->vmode == V8SImode - || d->vmode == V16HImode - || d->vmode == V32QImode) - { - /* First see if vpermq can be used for - V8SImode/V16HImode/V32QImode. 
*/ - if (valid_perm_using_mode_p (V4DImode, d)) - { - for (i = 0; i < 4; i++) - perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3; - if (d->testing_p) - return true; - target = gen_reg_rtx (V4DImode); - if (expand_vselect (target, gen_lowpart (V4DImode, d->op0), - perm, 4, false)) - { - emit_move_insn (d->target, - gen_lowpart (d->vmode, target)); - return true; - } - return false; - } + case 8: + if (!TARGET_XOP) + return false; + vmode = V8QImode; + break; - /* Next see if vpermd can be used. */ - if (valid_perm_using_mode_p (V8SImode, d)) - vmode = V8SImode; - } - /* Or if vpermps can be used. */ - else if (d->vmode == V8SFmode) - vmode = V8SImode; + case 16: + if (!TARGET_XOP) + return false; + break; - if (vmode == V32QImode) - { - /* vpshufb only works intra lanes, it is not - possible to shuffle bytes in between the lanes. */ - for (i = 0; i < nelt; ++i) - if ((d->perm[i] ^ i) & (nelt / 2)) - return false; - } - } - else if (GET_MODE_SIZE (d->vmode) == 64) - { - if (!TARGET_AVX512BW) - return false; + case 32: + if (!TARGET_AVX2) + return false; - /* If vpermq didn't work, vpshufb won't work either. */ - if (d->vmode == V8DFmode || d->vmode == V8DImode) - return false; + if (valid_perm_using_mode_p (V2TImode, d)) + { + if (d->testing_p) + return true; - vmode = V64QImode; - if (d->vmode == V16SImode - || d->vmode == V32HImode - || d->vmode == V64QImode) - { - /* First see if vpermq can be used for - V16SImode/V32HImode/V64QImode. */ - if (valid_perm_using_mode_p (V8DImode, d)) - { - for (i = 0; i < 8; i++) - perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7; - if (d->testing_p) + /* Use vperm2i128 insn. The pattern uses + V4DImode instead of V2TImode. */ + target = d->target; + if (d->vmode != V4DImode) + target = gen_reg_rtx (V4DImode); + op0 = gen_lowpart (V4DImode, d->op0); + op1 = gen_lowpart (V4DImode, d->op1); + rperm[0] + = GEN_INT ((d->perm[0] / (nelt / 2)) + | ((d->perm[nelt / 2] / (nelt / 2)) * 16)); + emit_insn (gen_avx2_permv2ti (target, op0, op1, rperm[0])); + if (target != d->target) + emit_move_insn (d->target, gen_lowpart (d->vmode, target)); + return true; + } + /* FALLTHRU */ + + default: + return false; + } + else + switch (GET_MODE_SIZE (d->vmode)) + { + case 4: + if (!TARGET_SSSE3) + return false; + vmode = V4QImode; + break; + + case 8: + if (!TARGET_SSSE3) + return false; + vmode = V8QImode; + break; + + case 16: + if (!TARGET_SSSE3) + return false; + break; + + case 32: + if (!TARGET_AVX2) + return false; + + /* V4DImode should be already handled through + expand_vselect by vpermq instruction. */ + gcc_assert (d->vmode != V4DImode); + + vmode = V32QImode; + if (d->vmode == V8SImode + || d->vmode == V16HImode + || d->vmode == V32QImode) + { + /* First see if vpermq can be used for + V8SImode/V16HImode/V32QImode. */ + if (valid_perm_using_mode_p (V4DImode, d)) + { + for (i = 0; i < 4; i++) + perm[i] = (d->perm[i * nelt / 4] * 4 / nelt) & 3; + if (d->testing_p) + return true; + target = gen_reg_rtx (V4DImode); + if (expand_vselect (target, gen_lowpart (V4DImode, d->op0), + perm, 4, false)) + { + emit_move_insn (d->target, + gen_lowpart (d->vmode, target)); return true; - target = gen_reg_rtx (V8DImode); - if (expand_vselect (target, gen_lowpart (V8DImode, d->op0), - perm, 8, false)) - { - emit_move_insn (d->target, - gen_lowpart (d->vmode, target)); - return true; - } - return false; - } + } + return false; + } - /* Next see if vpermd can be used. */ - if (valid_perm_using_mode_p (V16SImode, d)) - vmode = V16SImode; - } - /* Or if vpermps can be used. 
*/ - else if (d->vmode == V16SFmode) - vmode = V16SImode; - if (vmode == V64QImode) - { - /* vpshufb only works intra lanes, it is not - possible to shuffle bytes in between the lanes. */ - for (i = 0; i < nelt; ++i) - if ((d->perm[i] ^ i) & (3 * nelt / 4)) - return false; - } - } - else + /* Next see if vpermd can be used. */ + if (valid_perm_using_mode_p (V8SImode, d)) + vmode = V8SImode; + } + /* Or if vpermps can be used. */ + else if (d->vmode == V8SFmode) + vmode = V8SImode; + + if (vmode == V32QImode) + { + /* vpshufb only works intra lanes, it is not + possible to shuffle bytes in between the lanes. */ + for (i = 0; i < nelt; ++i) + if ((d->perm[i] ^ i) & (nelt / 2)) + return false; + } + break; + + case 64: + if (!TARGET_AVX512BW) + return false; + + /* If vpermq didn't work, vpshufb won't work either. */ + if (d->vmode == V8DFmode || d->vmode == V8DImode) + return false; + + vmode = V64QImode; + if (d->vmode == V16SImode + || d->vmode == V32HImode + || d->vmode == V64QImode) + { + /* First see if vpermq can be used for + V16SImode/V32HImode/V64QImode. */ + if (valid_perm_using_mode_p (V8DImode, d)) + { + for (i = 0; i < 8; i++) + perm[i] = (d->perm[i * nelt / 8] * 8 / nelt) & 7; + if (d->testing_p) + return true; + target = gen_reg_rtx (V8DImode); + if (expand_vselect (target, gen_lowpart (V8DImode, d->op0), + perm, 8, false)) + { + emit_move_insn (d->target, + gen_lowpart (d->vmode, target)); + return true; + } + return false; + } + + /* Next see if vpermd can be used. */ + if (valid_perm_using_mode_p (V16SImode, d)) + vmode = V16SImode; + } + /* Or if vpermps can be used. */ + else if (d->vmode == V16SFmode) + vmode = V16SImode; + if (vmode == V64QImode) + { + /* vpshufb only works intra lanes, it is not + possible to shuffle bytes in between the lanes. */ + for (i = 0; i < nelt; ++i) + if ((d->perm[i] ^ i) & (3 * nelt / 4)) + return false; + } + break; + + default: return false; - } + } if (d->testing_p) return true; + /* Try to avoid variable permutation instruction. */ + if (canonicalize_vector_int_perm (d, &nd) && expand_vec_perm_1 (&nd)) + { + emit_move_insn (d->target, gen_lowpart (d->vmode, nd.target)); + return true; + } + if (vmode == V8SImode) for (i = 0; i < 8; ++i) rperm[i] = GEN_INT ((d->perm[i * nelt / 8] * 8 / nelt) & 7); @@ -17346,12 +18048,12 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) eltsz = GET_MODE_UNIT_SIZE (d->vmode); if (!d->one_operand_p) mask = 2 * nelt - 1; - else if (vmode == V16QImode) - mask = nelt - 1; else if (vmode == V64QImode) mask = nelt / 4 - 1; - else + else if (vmode == V32QImode) mask = nelt / 2 - 1; + else + mask = nelt - 1; for (i = 0; i < nelt; ++i) { @@ -17361,95 +18063,98 @@ expand_vec_perm_pshufb (struct expand_vec_perm_d *d) } } - vperm = gen_rtx_CONST_VECTOR (vmode, - gen_rtvec_v (GET_MODE_NUNITS (vmode), rperm)); - vperm = force_reg (vmode, vperm); + machine_mode vpmode = vmode; - target = d->target; - if (d->vmode != vmode) + if (vmode == V4QImode + || vmode == V8QImode) + { + rtx m128 = GEN_INT (-128); + + /* Remap elements from the second operand, as we have to + account for inactive top elements from the first operand. */ + if (!d->one_operand_p) + { + int sz = GET_MODE_SIZE (vmode); + + for (i = 0; i < nelt; ++i) + { + int ival = INTVAL (rperm[i]); + if (ival >= sz) + ival += 16-sz; + rperm[i] = GEN_INT (ival); + } + } + + /* V4QI/V8QI is emulated with V16QI instruction, fill inactive + elements in the top positions with zeros. 
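
   For illustration (an editorial intrinsics sketch, not code from
   this patch; the function name is invented): a V8QI shuffle run on
   the full 16-byte unit.  Control bytes with the high bit set
   (-128) make pshufb write zero, so the unused top half of the
   result stays zeroed:

     #include <immintrin.h>

     // Reverse the low 8 bytes; control bytes 8..15 are -128, so
     // bytes 8..15 of the result are zero.  SSSE3.
     static __m128i
     rev_v8qi (__m128i v)
     {
       const __m128i ctrl = _mm_setr_epi8 (7, 6, 5, 4, 3, 2, 1, 0,
                                           -128, -128, -128, -128,
                                           -128, -128, -128, -128);
       return _mm_shuffle_epi8 (v, ctrl);
     }
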
*/ + for (i = nelt; i < 16; ++i) + rperm[i] = m128; + + vpmode = V16QImode; + } + + vperm = gen_rtx_CONST_VECTOR (vpmode, + gen_rtvec_v (GET_MODE_NUNITS (vpmode), rperm)); + vperm = force_reg (vpmode, vperm); + + if (vmode == d->vmode) + target = d->target; + else target = gen_reg_rtx (vmode); + op0 = gen_lowpart (vmode, d->op0); + if (d->one_operand_p) { - if (vmode == V16QImode) - emit_insn (gen_ssse3_pshufbv16qi3 (target, op0, vperm)); + rtx (*gen) (rtx, rtx, rtx); + + if (vmode == V4QImode) + gen = gen_mmx_pshufbv4qi3; + else if (vmode == V8QImode) + gen = gen_mmx_pshufbv8qi3; + else if (vmode == V16QImode) + gen = gen_ssse3_pshufbv16qi3; else if (vmode == V32QImode) - emit_insn (gen_avx2_pshufbv32qi3 (target, op0, vperm)); + gen = gen_avx2_pshufbv32qi3; else if (vmode == V64QImode) - emit_insn (gen_avx512bw_pshufbv64qi3 (target, op0, vperm)); + gen = gen_avx512bw_pshufbv64qi3; else if (vmode == V8SFmode) - emit_insn (gen_avx2_permvarv8sf (target, op0, vperm)); + gen = gen_avx2_permvarv8sf; else if (vmode == V8SImode) - emit_insn (gen_avx2_permvarv8si (target, op0, vperm)); + gen = gen_avx2_permvarv8si; else if (vmode == V16SFmode) - emit_insn (gen_avx512f_permvarv16sf (target, op0, vperm)); + gen = gen_avx512f_permvarv16sf; else if (vmode == V16SImode) - emit_insn (gen_avx512f_permvarv16si (target, op0, vperm)); + gen = gen_avx512f_permvarv16si; else gcc_unreachable (); + + emit_insn (gen (target, op0, vperm)); } else { + rtx (*gen) (rtx, rtx, rtx, rtx); + op1 = gen_lowpart (vmode, d->op1); - emit_insn (gen_xop_pperm (target, op0, op1, vperm)); + + if (vmode == V4QImode) + gen = gen_mmx_ppermv32; + else if (vmode == V8QImode) + gen = gen_mmx_ppermv64; + else if (vmode == V16QImode) + gen = gen_xop_pperm; + else + gcc_unreachable (); + + emit_insn (gen (target, op0, op1, vperm)); } + if (target != d->target) emit_move_insn (d->target, gen_lowpart (d->vmode, target)); return true; } -/* For V*[QHS]Imode permutations, check if the same permutation - can't be performed in a 2x, 4x or 8x wider inner mode. */ - -static bool -canonicalize_vector_int_perm (const struct expand_vec_perm_d *d, - struct expand_vec_perm_d *nd) -{ - int i; - machine_mode mode = VOIDmode; - - switch (d->vmode) - { - case E_V16QImode: mode = V8HImode; break; - case E_V32QImode: mode = V16HImode; break; - case E_V64QImode: mode = V32HImode; break; - case E_V8HImode: mode = V4SImode; break; - case E_V16HImode: mode = V8SImode; break; - case E_V32HImode: mode = V16SImode; break; - case E_V4SImode: mode = V2DImode; break; - case E_V8SImode: mode = V4DImode; break; - case E_V16SImode: mode = V8DImode; break; - default: return false; - } - for (i = 0; i < d->nelt; i += 2) - if ((d->perm[i] & 1) || d->perm[i + 1] != d->perm[i] + 1) - return false; - nd->vmode = mode; - nd->nelt = d->nelt / 2; - for (i = 0; i < nd->nelt; i++) - nd->perm[i] = d->perm[2 * i] / 2; - if (GET_MODE_INNER (mode) != DImode) - canonicalize_vector_int_perm (nd, nd); - if (nd != d) - { - nd->one_operand_p = d->one_operand_p; - nd->testing_p = d->testing_p; - if (d->op0 == d->op1) - nd->op0 = nd->op1 = gen_lowpart (nd->vmode, d->op0); - else - { - nd->op0 = gen_lowpart (nd->vmode, d->op0); - nd->op1 = gen_lowpart (nd->vmode, d->op1); - } - if (d->testing_p) - nd->target = gen_raw_REG (nd->vmode, LAST_VIRTUAL_REGISTER + 1); - else - nd->target = gen_reg_rtx (nd->vmode); - } - return true; -} - /* Try to expand one-operand permutation with constant mask. 
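
   For example (an editorial sketch, not from this patch; the name
   is invented), the vpermw form the new E_V32HImode case emits
   corresponds roughly to:

     #include <immintrin.h>

     // One-operand V32HI permutation with a constant index vector,
     // here a full reversal.  Requires AVX512BW, matching the new
     // TARGET_AVX512BW check for 2-byte elements.
     static __m512i
     reverse_words (__m512i v)
     {
       __m512i idx = _mm512_set_epi16 (0, 1, 2, 3, 4, 5, 6, 7,
                                       8, 9, 10, 11, 12, 13, 14, 15,
                                       16, 17, 18, 19, 20, 21, 22, 23,
                                       24, 25, 26, 27, 28, 29, 30, 31);
       return _mm512_permutexvar_epi16 (idx, v);
     }
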
*/ static bool @@ -17457,6 +18162,7 @@ ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d) { machine_mode mode = GET_MODE (d->op0); machine_mode maskmode = mode; + unsigned inner_size = GET_MODE_SIZE (GET_MODE_INNER (mode)); rtx (*gen) (rtx, rtx, rtx) = NULL; rtx target, op0, mask; rtx vec[64]; @@ -17467,6 +18173,18 @@ ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d) if (!TARGET_AVX512F) return false; + /* Accept VNxHImode and VNxQImode now. */ + if (!TARGET_AVX512VL && GET_MODE_SIZE (mode) < 64) + return false; + + /* vpermw. */ + if (!TARGET_AVX512BW && inner_size == 2) + return false; + + /* vpermb. */ + if (!TARGET_AVX512VBMI && inner_size == 1) + return false; + switch (mode) { case E_V16SImode: @@ -17483,10 +18201,32 @@ ix86_expand_vec_one_operand_perm_avx512 (struct expand_vec_perm_d *d) gen = gen_avx512f_permvarv8df; maskmode = V8DImode; break; + case E_V32HImode: + gen = gen_avx512bw_permvarv32hi; + break; + case E_V16HImode: + gen = gen_avx512vl_permvarv16hi; + break; + case E_V8HImode: + gen = gen_avx512vl_permvarv8hi; + break; + case E_V64QImode: + gen = gen_avx512bw_permvarv64qi; + break; + case E_V32QImode: + gen = gen_avx512vl_permvarv32qi; + break; + case E_V16QImode: + gen = gen_avx512vl_permvarv16qi; + break; + default: return false; } + if (d->testing_p) + return true; + target = d->target; op0 = d->op0; for (int i = 0; i < d->nelt; ++i) @@ -17669,7 +18409,7 @@ expand_vec_perm_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_palignr (d, true)) return true; - /* Try the AVX512F vperm{s,d} instructions. */ + /* Try the AVX512F vperm{w,b,s,d} instructions */ if (ix86_expand_vec_one_operand_perm_avx512 (d)) return true; @@ -17881,7 +18621,9 @@ expand_vec_perm_pblendv (struct expand_vec_perm_d *d) ; else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) ; - else if (TARGET_SSE4_1 && GET_MODE_SIZE (vmode) == 16) + else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 4 + || GET_MODE_SIZE (vmode) == 8 + || GET_MODE_SIZE (vmode) == 16)) ; else return false; @@ -17960,7 +18702,9 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d) rtx_insn *seq; bool ok, same_halves = false; - if (GET_MODE_SIZE (d->vmode) == 16) + if (GET_MODE_SIZE (d->vmode) == 4 + || GET_MODE_SIZE (d->vmode) == 8 + || GET_MODE_SIZE (d->vmode) == 16) { if (d->one_operand_p) return false; @@ -17995,7 +18739,45 @@ expand_vec_perm_interleave2 (struct expand_vec_perm_d *d) memset (remap, 0xff, sizeof (remap)); dremap = *d; - if (GET_MODE_SIZE (d->vmode) == 16) + if (GET_MODE_SIZE (d->vmode) == 4 + || GET_MODE_SIZE (d->vmode) == 8) + { + unsigned HOST_WIDE_INT h1, h2, h3, h4; + + /* Split the two input vectors into 4 halves. */ + h1 = (HOST_WIDE_INT_1U << nelt2) - 1; + h2 = h1 << nelt2; + h3 = h2 << nelt2; + h4 = h3 << nelt2; + + /* If the elements from the low halves use interleave low, + and similarly for interleave high. 
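
   A worked instance (editorial, not from the patch): for V8QI,
   nelt2 is 4, so h1 = 0x000f, h2 = 0x00f0, h3 = 0x0f00 and
   h4 = 0xf000 over the 16 element positions of op0:op1.  The
   permutation {0, 8, 1, 9, 2, 10, 3, 11} only touches positions
   in h1 | h3, so it is exactly punpcklbw; {4, 12, 5, 13, 6, 14,
   7, 15} lands in h2 | h4 and is punpckhbw.  A permutation that
   mixes, say, positions 0 and 12 uses both low and high halves
   and falls through to the return false below.
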
*/ + if ((contents & (h1 | h3)) == contents) + { + /* punpckl* */ + for (i = 0; i < nelt2; ++i) + { + remap[i] = i * 2; + remap[i + nelt] = i * 2 + 1; + dremap.perm[i * 2] = i; + dremap.perm[i * 2 + 1] = i + nelt; + } + } + else if ((contents & (h2 | h4)) == contents) + { + /* punpckh* */ + for (i = 0; i < nelt2; ++i) + { + remap[i + nelt2] = i * 2; + remap[i + nelt + nelt2] = i * 2 + 1; + dremap.perm[i * 2] = i + nelt2; + dremap.perm[i * 2 + 1] = i + nelt + nelt2; + } + } + else + return false; + } + else if (GET_MODE_SIZE (d->vmode) == 16) { unsigned HOST_WIDE_INT h1, h2, h3, h4; @@ -18566,6 +19348,244 @@ expand_vec_perm_vperm2f128_vblend (struct expand_vec_perm_d *d) return true; } +/* A subroutine of ix86_expand_vec_perm_const_1. Try to implement + a two vector permutation using two single vector permutations and + {,v}{,p}unpckl{ps,pd,bw,wd,dq}. If two_insn, succeed only if one + of dfirst or dsecond is identity permutation. */ + +static bool +expand_vec_perm_2perm_interleave (struct expand_vec_perm_d *d, bool two_insn) +{ + unsigned i, nelt = d->nelt, nelt2 = nelt / 2, lane = nelt; + struct expand_vec_perm_d dfirst, dsecond, dfinal; + bool ident1 = true, ident2 = true; + + if (d->one_operand_p) + return false; + + if (GET_MODE_SIZE (d->vmode) == 16) + { + if (!TARGET_SSE) + return false; + if (d->vmode != V4SFmode && d->vmode != V2DFmode && !TARGET_SSE2) + return false; + } + else if (GET_MODE_SIZE (d->vmode) == 32) + { + if (!TARGET_AVX) + return false; + if (d->vmode != V8SFmode && d->vmode != V4DFmode && !TARGET_AVX2) + return false; + lane = nelt2; + } + else + return false; + + for (i = 1; i < nelt; i++) + if ((d->perm[i] >= nelt) != ((d->perm[0] >= nelt) ^ (i & 1))) + return false; + + dfirst = *d; + dsecond = *d; + dfinal = *d; + dfirst.op1 = dfirst.op0; + dfirst.one_operand_p = true; + dsecond.op0 = dsecond.op1; + dsecond.one_operand_p = true; + + for (i = 0; i < nelt; i++) + if (d->perm[i] >= nelt) + { + dsecond.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i] - nelt; + if (d->perm[i] - nelt != i / 2 + (i >= lane ? lane / 2 : 0)) + ident2 = false; + dsecond.perm[i / 2 + (i >= lane ? lane : lane / 2)] + = d->perm[i] - nelt; + } + else + { + dfirst.perm[i / 2 + (i >= lane ? lane / 2 : 0)] = d->perm[i]; + if (d->perm[i] != i / 2 + (i >= lane ? lane / 2 : 0)) + ident1 = false; + dfirst.perm[i / 2 + (i >= lane ? lane : lane / 2)] = d->perm[i]; + } + + if (two_insn && !ident1 && !ident2) + return false; + + if (!d->testing_p) + { + if (!ident1) + dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode); + if (!ident2) + dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode); + if (d->perm[0] >= nelt) + std::swap (dfinal.op0, dfinal.op1); + } + + bool ok; + rtx_insn *seq1 = NULL, *seq2 = NULL; + + if (!ident1) + { + start_sequence (); + ok = expand_vec_perm_1 (&dfirst); + seq1 = get_insns (); + end_sequence (); + + if (!ok) + return false; + } + + if (!ident2) + { + start_sequence (); + ok = expand_vec_perm_1 (&dsecond); + seq2 = get_insns (); + end_sequence (); + + if (!ok) + return false; + } + + if (d->testing_p) + return true; + + for (i = 0; i < nelt; i++) + { + dfinal.perm[i] = i / 2; + if (i >= lane) + dfinal.perm[i] += lane / 2; + if ((i & 1) != 0) + dfinal.perm[i] += nelt; + } + emit_insn (seq1); + emit_insn (seq2); + ok = expand_vselect_vconcat (dfinal.target, dfinal.op0, dfinal.op1, + dfinal.perm, dfinal.nelt, false); + gcc_assert (ok); + return true; +} + +/* A subroutine of ix86_expand_vec_perm_const_1. 
Try to simplify + the permutation using two single vector permutations and the SSE4_1 pblendv + instruction. If two_insn, succeed only if one of dfirst or dsecond is + identity permutation. */ + +static bool +expand_vec_perm_2perm_pblendv (struct expand_vec_perm_d *d, bool two_insn) +{ + unsigned i, nelt = d->nelt; + struct expand_vec_perm_d dfirst, dsecond, dfinal; + machine_mode vmode = d->vmode; + bool ident1 = true, ident2 = true; + + /* Use the same checks as in expand_vec_perm_blend. */ + if (d->one_operand_p) + return false; + if (TARGET_AVX2 && GET_MODE_SIZE (vmode) == 32) + ; + else if (TARGET_AVX && (vmode == V4DFmode || vmode == V8SFmode)) + ; + else if (TARGET_SSE4_1 && (GET_MODE_SIZE (vmode) == 16 + || GET_MODE_SIZE (vmode) == 8 + || GET_MODE_SIZE (vmode) == 4)) + ; + else + return false; + + dfirst = *d; + dsecond = *d; + dfinal = *d; + dfirst.op1 = dfirst.op0; + dfirst.one_operand_p = true; + dsecond.op0 = dsecond.op1; + dsecond.one_operand_p = true; + + for (i = 0; i < nelt; ++i) + if (d->perm[i] >= nelt) + { + dfirst.perm[i] = 0xff; + dsecond.perm[i] = d->perm[i] - nelt; + if (d->perm[i] != i + nelt) + ident2 = false; + } + else + { + dsecond.perm[i] = 0xff; + dfirst.perm[i] = d->perm[i]; + if (d->perm[i] != i) + ident1 = false; + } + + if (two_insn && !ident1 && !ident2) + return false; + + /* For now. Ideally treat 0xff as a wildcard. */ + for (i = 0; i < nelt; ++i) + if (dfirst.perm[i] == 0xff) + { + if (GET_MODE_SIZE (vmode) == 32 + && dfirst.perm[i ^ (nelt / 2)] != 0xff) + dfirst.perm[i] = dfirst.perm[i ^ (nelt / 2)] ^ (nelt / 2); + else + dfirst.perm[i] = i; + } + else + { + if (GET_MODE_SIZE (vmode) == 32 + && dsecond.perm[i ^ (nelt / 2)] != 0xff) + dsecond.perm[i] = dsecond.perm[i ^ (nelt / 2)] ^ (nelt / 2); + else + dsecond.perm[i] = i; + } + + if (!d->testing_p) + { + if (!ident1) + dfinal.op0 = dfirst.target = gen_reg_rtx (d->vmode); + if (!ident2) + dfinal.op1 = dsecond.target = gen_reg_rtx (d->vmode); + } + + bool ok; + rtx_insn *seq1 = NULL, *seq2 = NULL; + + if (!ident1) + { + start_sequence (); + ok = expand_vec_perm_1 (&dfirst); + seq1 = get_insns (); + end_sequence (); + + if (!ok) + return false; + } + + if (!ident2) + { + start_sequence (); + ok = expand_vec_perm_1 (&dsecond); + seq2 = get_insns (); + end_sequence (); + + if (!ok) + return false; + } + + if (d->testing_p) + return true; + + for (i = 0; i < nelt; ++i) + dfinal.perm[i] = (d->perm[i] >= nelt ? i + nelt : i); + + emit_insn (seq1); + emit_insn (seq2); + ok = expand_vec_perm_blend (&dfinal); + gcc_assert (ok); + return true; +} + /* A subroutine of ix86_expand_vec_perm_const_1. Implement a V4DF permutation using two vperm2f128, followed by a vshufpd insn blending the two vectors together. 
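
   For instance (an editorial intrinsics sketch, not code from this
   patch; the name is invented), the V4DF permutation {3, 7, 0, 4}
   takes the cross-lane moves first and one vshufpd to blend:

     #include <immintrin.h>

     // { a[3], b[3], a[0], b[0] }: swap the 128-bit lanes of both
     // inputs with vperm2f128, then blend with vshufpd.  AVX.
     static __m256d
     perm_3704 (__m256d a, __m256d b)
     {
       __m256d as = _mm256_permute2f128_pd (a, a, 0x01); // {a2,a3,a0,a1}
       __m256d bs = _mm256_permute2f128_pd (b, b, 0x01); // {b2,b3,b0,b1}
       return _mm256_shuffle_pd (as, bs, 0x3);           // {a3,b3,a0,b0}
     }
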
*/ @@ -18726,14 +19746,36 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d) { rtx rperm[2][16], vperm, l, h, op, m128; unsigned int i, nelt, eltsz; + machine_mode mode; + rtx (*gen) (rtx, rtx, rtx); - if (!TARGET_SSSE3 || GET_MODE_SIZE (d->vmode) != 16) + if (!TARGET_SSSE3 || (GET_MODE_SIZE (d->vmode) != 16 + && GET_MODE_SIZE (d->vmode) != 8 + && GET_MODE_SIZE (d->vmode) != 4)) return false; gcc_assert (!d->one_operand_p); if (d->testing_p) return true; + switch (GET_MODE_SIZE (d->vmode)) + { + case 4: + mode = V4QImode; + gen = gen_mmx_pshufbv4qi3; + break; + case 8: + mode = V8QImode; + gen = gen_mmx_pshufbv8qi3; + break; + case 16: + mode = V16QImode; + gen = gen_ssse3_pshufbv16qi3; + break; + default: + gcc_unreachable (); + } + nelt = d->nelt; eltsz = GET_MODE_UNIT_SIZE (d->vmode); @@ -18744,7 +19786,7 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d) m128 = GEN_INT (-128); for (i = 0; i < nelt; ++i) { - unsigned j, e = d->perm[i]; + unsigned j, k, e = d->perm[i]; unsigned which = (e >= nelt); if (e >= nelt) e -= nelt; @@ -18754,26 +19796,29 @@ expand_vec_perm_pshufb2 (struct expand_vec_perm_d *d) rperm[which][i*eltsz + j] = GEN_INT (e*eltsz + j); rperm[1-which][i*eltsz + j] = m128; } + + for (k = i*eltsz + j; k < 16; ++k) + rperm[0][k] = rperm[1][k] = m128; } vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[0])); vperm = force_reg (V16QImode, vperm); - l = gen_reg_rtx (V16QImode); - op = gen_lowpart (V16QImode, d->op0); - emit_insn (gen_ssse3_pshufbv16qi3 (l, op, vperm)); + l = gen_reg_rtx (mode); + op = gen_lowpart (mode, d->op0); + emit_insn (gen (l, op, vperm)); vperm = gen_rtx_CONST_VECTOR (V16QImode, gen_rtvec_v (16, rperm[1])); vperm = force_reg (V16QImode, vperm); - h = gen_reg_rtx (V16QImode); - op = gen_lowpart (V16QImode, d->op1); - emit_insn (gen_ssse3_pshufbv16qi3 (h, op, vperm)); + h = gen_reg_rtx (mode); + op = gen_lowpart (mode, d->op1); + emit_insn (gen (h, op, vperm)); op = d->target; - if (d->vmode != V16QImode) - op = gen_reg_rtx (V16QImode); - emit_insn (gen_iorv16qi3 (op, l, h)); + if (d->vmode != mode) + op = gen_reg_rtx (mode); + emit_insn (gen_rtx_SET (op, gen_rtx_IOR (mode, l, h))); if (op != d->target) emit_move_insn (d->target, gen_lowpart (d->vmode, op)); @@ -18932,9 +19977,9 @@ expand_vec_perm_vpshufb2_vpermq_even_odd (struct expand_vec_perm_d *d) } /* A subroutine of expand_vec_perm_even_odd_1. Implement extract-even - and extract-odd permutations of two V16QI, V8HI, V16HI or V32QI operands - with two "and" and "pack" or two "shift" and "pack" insns. We should - have already failed all two instruction sequences. */ + and extract-odd permutations of two V8QI, V8HI, V16QI, V16HI or V32QI + operands with two "and" and "pack" or two "shift" and "pack" insns. + We should have already failed all two instruction sequences. */ static bool expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d) @@ -18952,6 +19997,17 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d) switch (d->vmode) { + case E_V4HImode: + /* Required for "pack". */ + if (!TARGET_SSE4_1) + return false; + c = 0xffff; + s = 16; + half_mode = V2SImode; + gen_and = gen_andv2si3; + gen_pack = gen_mmx_packusdw; + gen_shift = gen_lshrv2si3; + break; case E_V8HImode: /* Required for "pack". */ if (!TARGET_SSE4_1) @@ -18963,6 +20019,15 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d) gen_pack = gen_sse4_1_packusdw; gen_shift = gen_lshrv4si3; break; + case E_V8QImode: + /* No check as all instructions are SSE2. 
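
   The idea in intrinsics form (editorial sketch, not from this
   patch; names invented), shown for the 128-bit V16QI case which
   uses the same mask and shift constants:

     #include <immintrin.h>

     // Even bytes of a:b - mask the odd bytes away, then pack.
     static __m128i
     even_bytes (__m128i a, __m128i b)
     {
       __m128i m = _mm_set1_epi16 (0x00ff);
       return _mm_packus_epi16 (_mm_and_si128 (a, m),
                                _mm_and_si128 (b, m));
     }

     // Odd bytes of a:b - shift them down, then pack.
     static __m128i
     odd_bytes (__m128i a, __m128i b)
     {
       return _mm_packus_epi16 (_mm_srli_epi16 (a, 8),
                                _mm_srli_epi16 (b, 8));
     }
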
*/ + c = 0xff; + s = 8; + half_mode = V4HImode; + gen_and = gen_andv4hi3; + gen_pack = gen_mmx_packuswb; + gen_shift = gen_lshrv4hi3; + break; case E_V16QImode: /* No check as all instructions are SSE2. */ c = 0xff; @@ -18995,8 +20060,8 @@ expand_vec_perm_even_odd_pack (struct expand_vec_perm_d *d) end_perm = true; break; default: - /* Only V8HI, V16QI, V16HI and V32QI modes are more profitable than - general shuffles. */ + /* Only V4HI, V8QI, V8HI, V16QI, V16HI and V32QI modes + are more profitable than general shuffles. */ return false; } @@ -19174,6 +20239,7 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) case E_V2DImode: case E_V2SImode: case E_V4SImode: + case E_V2HImode: /* These are always directly implementable by expand_vec_perm_1. */ gcc_unreachable (); @@ -19184,19 +20250,46 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) return false; break; + case E_V4QImode: + if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB) + return expand_vec_perm_pshufb2 (d); + else + { + if (d->testing_p) + break; + /* We need 2*log2(N)-1 operations to achieve odd/even + with interleave. */ + t1 = gen_reg_rtx (V4QImode); + emit_insn (gen_mmx_punpckhbw_low (t1, d->op0, d->op1)); + emit_insn (gen_mmx_punpcklbw_low (d->target, d->op0, d->op1)); + if (odd) + t2 = gen_mmx_punpckhbw_low (d->target, d->target, t1); + else + t2 = gen_mmx_punpcklbw_low (d->target, d->target, t1); + emit_insn (t2); + } + break; + case E_V4HImode: - if (d->testing_p) - break; - /* We need 2*log2(N)-1 operations to achieve odd/even - with interleave. */ - t1 = gen_reg_rtx (V4HImode); - emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1)); - emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1)); - if (odd) - t2 = gen_mmx_punpckhwd (d->target, d->target, t1); + if (TARGET_SSE4_1) + return expand_vec_perm_even_odd_pack (d); + else if (TARGET_SSSE3 && !TARGET_SLOW_PSHUFB) + return expand_vec_perm_pshufb2 (d); else - t2 = gen_mmx_punpcklwd (d->target, d->target, t1); - emit_insn (t2); + { + if (d->testing_p) + break; + /* We need 2*log2(N)-1 operations to achieve odd/even + with interleave. */ + t1 = gen_reg_rtx (V4HImode); + emit_insn (gen_mmx_punpckhwd (t1, d->op0, d->op1)); + emit_insn (gen_mmx_punpcklwd (d->target, d->op0, d->op1)); + if (odd) + t2 = gen_mmx_punpckhwd (d->target, d->target, t1); + else + t2 = gen_mmx_punpcklwd (d->target, d->target, t1); + emit_insn (t2); + } break; case E_V8HImode: @@ -19224,6 +20317,7 @@ expand_vec_perm_even_odd_1 (struct expand_vec_perm_d *d, unsigned odd) } break; + case E_V8QImode: case E_V16QImode: return expand_vec_perm_even_odd_pack (d); @@ -19354,6 +20448,11 @@ expand_vec_perm_even_odd (struct expand_vec_perm_d *d) if (d->perm[i] != 2 * i + odd) return false; + if (d->vmode == E_V32HImode + && d->testing_p + && !TARGET_AVX512BW) + return false; + return expand_vec_perm_even_odd_1 (d, odd); } @@ -19365,6 +20464,7 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) { unsigned elt = d->perm[0], nelt2 = d->nelt / 2; machine_mode vmode = d->vmode; + rtx (*gen) (rtx, rtx, rtx); unsigned char perm2[4]; rtx op0 = d->op0, dest; bool ok; @@ -19384,9 +20484,70 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) case E_V2DImode: case E_V2SImode: case E_V4SImode: + case E_V2HImode: + case E_V4HImode: /* These are always implementable using standard shuffle patterns. */ gcc_unreachable (); + case E_V4QImode: + /* This can be implemented via interleave and pshuflw. 
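
   Roughly (an editorial sketch, not from this patch; the name is
   invented), for broadcasting byte 0 of a V4QI held in an SSE
   register:

     #include <immintrin.h>

     // punpcklbw doubles each byte into a word, then pshuflw
     // replicates the word holding the wanted byte.  SSE2.
     static __m128i
     bcast_byte0_v4qi (__m128i v)
     {
       __m128i t = _mm_unpacklo_epi8 (v, v);  // {b0,b0,b1,b1,...}
       return _mm_shufflelo_epi16 (t, 0x00);  // low words all b0b0
     }
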
*/ + if (d->testing_p) + return true; + + if (elt >= nelt2) + { + gen = gen_mmx_punpckhbw_low; + elt -= nelt2; + } + else + gen = gen_mmx_punpcklbw_low; + + dest = gen_reg_rtx (vmode); + emit_insn (gen (dest, op0, op0)); + vmode = get_mode_wider_vector (vmode); + op0 = gen_lowpart (vmode, dest); + + memset (perm2, elt, 2); + dest = gen_reg_rtx (vmode); + ok = expand_vselect (dest, op0, perm2, 2, d->testing_p); + gcc_assert (ok); + + emit_move_insn (d->target, gen_lowpart (d->vmode, dest)); + return true; + + case E_V8QImode: + /* This can be implemented via interleave. We save one insn by + stopping once we have promoted to V2SImode and then use pshufd. */ + if (d->testing_p) + return true; + do + { + if (elt >= nelt2) + { + gen = vmode == V8QImode ? gen_mmx_punpckhbw + : gen_mmx_punpckhwd; + elt -= nelt2; + } + else + gen = vmode == V8QImode ? gen_mmx_punpcklbw + : gen_mmx_punpcklwd; + nelt2 /= 2; + + dest = gen_reg_rtx (vmode); + emit_insn (gen (dest, op0, op0)); + vmode = get_mode_wider_vector (vmode); + op0 = gen_lowpart (vmode, dest); + } + while (vmode != V2SImode); + + memset (perm2, elt, 2); + dest = gen_reg_rtx (vmode); + ok = expand_vselect (dest, op0, perm2, 2, d->testing_p); + gcc_assert (ok); + + emit_move_insn (d->target, gen_lowpart (d->vmode, dest)); + return true; + case E_V8HImode: case E_V16QImode: /* These can be implemented via interleave. We save one insn by @@ -19395,17 +20556,15 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) return true; do { - rtx dest; - rtx (*gen) (rtx, rtx, rtx) - = vmode == V16QImode ? gen_vec_interleave_lowv16qi - : gen_vec_interleave_lowv8hi; - if (elt >= nelt2) { gen = vmode == V16QImode ? gen_vec_interleave_highv16qi : gen_vec_interleave_highv8hi; elt -= nelt2; } + else + gen = vmode == V16QImode ? gen_vec_interleave_lowv16qi + : gen_vec_interleave_lowv8hi; nelt2 /= 2; dest = gen_reg_rtx (vmode); @@ -19416,14 +20575,13 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) while (vmode != V4SImode); memset (perm2, elt, 4); - dest = gen_reg_rtx (V4SImode); + dest = gen_reg_rtx (vmode); ok = expand_vselect (dest, op0, perm2, 4, d->testing_p); gcc_assert (ok); - if (!d->testing_p) - emit_move_insn (d->target, gen_lowpart (d->vmode, dest)); + + emit_move_insn (d->target, gen_lowpart (d->vmode, dest)); return true; - case E_V64QImode: case E_V32QImode: case E_V16HImode: case E_V8SImode: @@ -19433,6 +20591,14 @@ expand_vec_perm_broadcast_1 (struct expand_vec_perm_d *d) gcc_assert (!TARGET_AVX2 || d->perm[0]); return false; + case E_V64QImode: + gcc_assert (!TARGET_AVX512BW || d->perm[0]); + return false; + + case E_V32HImode: + gcc_assert (!TARGET_AVX512BW); + return false; + default: gcc_unreachable (); } @@ -19677,6 +20843,12 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_pblendv (d)) return true; + if (expand_vec_perm_2perm_interleave (d, true)) + return true; + + if (expand_vec_perm_2perm_pblendv (d, true)) + return true; + /* Try sequences of three instructions. */ if (expand_vec_perm_even_odd_pack (d)) @@ -19694,6 +20866,12 @@ ix86_expand_vec_perm_const_1 (struct expand_vec_perm_d *d) if (expand_vec_perm_vperm2f128_vblend (d)) return true; + if (expand_vec_perm_2perm_interleave (d, false)) + return true; + + if (expand_vec_perm_2perm_pblendv (d, false)) + return true; + /* Try sequences of four instructions. 
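
   As a concrete case for the new 2perm helpers above (editorial,
   not from the patch): the V4SI permutation {4, 2, 5, 0} alternates
   op1/op0 elements, so expand_vec_perm_2perm_interleave shuffles
   op0 by {2, 0, 2, 0}, leaves op1 untouched (its half of the
   permutation is the identity), and joins the two intermediates
   with a single punpckldq.
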
*/ if (expand_vec_perm_even_odd_trunc (d)) @@ -19822,16 +21000,16 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, return true; break; case E_V32HImode: - if (!TARGET_AVX512BW) + if (!TARGET_AVX512F) return false; - if (d.testing_p) + if (d.testing_p && TARGET_AVX512BW) /* All implementable with a single vperm[it]2 insn. */ return true; break; case E_V64QImode: - if (!TARGET_AVX512BW) + if (!TARGET_AVX512F) return false; - if (d.testing_p) + if (d.testing_p && TARGET_AVX512BW) /* Implementable with 2 vperm[it]2, 2 vpshufb and 1 or insn. */ return true; break; @@ -19878,9 +21056,21 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, case E_V2SFmode: case E_V2SImode: case E_V4HImode: + case E_V8QImode: if (!TARGET_MMX_WITH_SSE) return false; break; + case E_V2HImode: + if (!TARGET_SSE2) + return false; + /* All implementable with *punpckwd. */ + if (d.testing_p) + return true; + break; + case E_V4QImode: + if (!TARGET_SSE2) + return false; + break; case E_V2DImode: case E_V2DFmode: if (!TARGET_SSE) @@ -19912,10 +21102,11 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, /* Check whether the mask can be applied to the vector type. */ d.one_operand_p = (which != 3); - /* Implementable with shufps or pshufd. */ + /* Implementable with shufps, pshufd or pshuflw. */ if (d.one_operand_p && (d.vmode == V4SFmode || d.vmode == V2SFmode - || d.vmode == V4SImode || d.vmode == V2SImode)) + || d.vmode == V4SImode || d.vmode == V2SImode + || d.vmode == V4HImode || d.vmode == V2HImode)) return true; /* Otherwise we have to go through the motions and see if we can @@ -19934,6 +21125,32 @@ ix86_vectorize_vec_perm_const (machine_mode vmode, rtx target, rtx op0, two_args = canonicalize_perm (&d); + /* If one of the operands is a zero vector, try to match pmovzx. */ + if (two_args && (d.op0 == CONST0_RTX (vmode) || d.op1 == CONST0_RTX (vmode))) + { + struct expand_vec_perm_d dzero = d; + if (d.op0 == CONST0_RTX (vmode)) + { + d.op1 = dzero.op1 = force_reg (vmode, d.op1); + std::swap (dzero.op0, dzero.op1); + for (i = 0; i < nelt; ++i) + dzero.perm[i] ^= nelt; + } + else + d.op0 = dzero.op0 = force_reg (vmode, d.op0); + + if (expand_vselect_vconcat (dzero.target, dzero.op0, dzero.op1, + dzero.perm, nelt, dzero.testing_p)) + return true; + } + + /* Force operands into registers. */ + rtx nop0 = force_reg (vmode, d.op0); + if (d.op0 == d.op1) + d.op1 = nop0; + d.op0 = nop0; + d.op1 = force_reg (vmode, d.op1); + if (ix86_expand_vec_perm_const_1 (&d)) return true; @@ -20003,8 +21220,9 @@ ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p) gcc_assert (ok); } -/* Optimize vector MUL generation for V8QI, V16QI and V32QI - under TARGET_AVX512BW. i.e. for v16qi a * b, it has +/* This function is similar as ix86_expand_vecop_qihi, + but optimized under AVX512BW by using vpmovwb. + For example, optimize vector MUL generation like vpmovzxbw ymm2, xmm0 vpmovzxbw ymm3, xmm1 @@ -20014,13 +21232,14 @@ ix86_expand_vec_interleave (rtx targ, rtx op0, rtx op1, bool high_p) it would take less instructions than ix86_expand_vecop_qihi. Return true if success. */ -bool -ix86_expand_vecmul_qihi (rtx dest, rtx op1, rtx op2) +static bool +ix86_expand_vecop_qihi2 (enum rtx_code code, rtx dest, rtx op1, rtx op2) { machine_mode himode, qimode = GET_MODE (dest); rtx hop1, hop2, hdest; rtx (*gen_extend)(rtx, rtx); rtx (*gen_truncate)(rtx, rtx); + bool uns_p = (code == ASHIFTRT) ? false : true; /* There's no V64HImode multiplication instruction. 
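
   The shape of what this emits for a V16QI multiply, as an
   editorial intrinsics sketch (not code from this patch; the name
   is invented):

     #include <immintrin.h>

     // vpmovzxbw / vpmullw / vpmovwb: widen to 16 bits, operate,
     // narrow again.  vpmovwb needs AVX512BW plus AVX512VL here.
     static __m128i
     mul_v16qi (__m128i a, __m128i b)
     {
       __m256i wa = _mm256_cvtepu8_epi16 (a);
       __m256i wb = _mm256_cvtepu8_epi16 (b);
       return _mm256_cvtepi16_epi8 (_mm256_mullo_epi16 (wa, wb));
     }
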
*/ if (qimode == E_V64QImode) return false; @@ -20041,17 +21260,17 @@ { case E_V8QImode: himode = V8HImode; - gen_extend = gen_zero_extendv8qiv8hi2; + gen_extend = uns_p ? gen_zero_extendv8qiv8hi2 : gen_extendv8qiv8hi2; gen_truncate = gen_truncv8hiv8qi2; break; case E_V16QImode: himode = V16HImode; - gen_extend = gen_zero_extendv16qiv16hi2; + gen_extend = uns_p ? gen_zero_extendv16qiv16hi2 : gen_extendv16qiv16hi2; gen_truncate = gen_truncv16hiv16qi2; break; case E_V32QImode: himode = V32HImode; - gen_extend = gen_zero_extendv32qiv32hi2; + gen_extend = uns_p ? gen_zero_extendv32qiv32hi2 : gen_extendv32qiv32hi2; gen_truncate = gen_truncv32hiv32qi2; break; default: @@ -20063,7 +21282,7 @@ hdest = gen_reg_rtx (himode); emit_insn (gen_extend (hop1, op1)); emit_insn (gen_extend (hop2, op2)); - emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (MULT, himode, + emit_insn (gen_rtx_SET (hdest, simplify_gen_binary (code, himode, hop1, hop2))); emit_insn (gen_truncate (dest, hdest)); return true; @@ -20071,8 +21290,9 @@ /* Expand a vector operation shift by constant for a V*QImode in terms of the same operation on V*HImode. Return true if success. */ -bool -ix86_expand_vec_shift_qihi_constant (enum rtx_code code, rtx dest, rtx op1, rtx op2) +static bool +ix86_expand_vec_shift_qihi_constant (enum rtx_code code, + rtx dest, rtx op1, rtx op2) { machine_mode qimode, himode; HOST_WIDE_INT and_constant, xor_constant; @@ -20184,6 +21404,16 @@ ix86_expand_vecop_qihi (enum rtx_code code, rtx dest, rtx op1, rtx op2) bool uns_p = false; int i; + if (CONST_INT_P (op2) + && (code == ASHIFT || code == LSHIFTRT || code == ASHIFTRT) + && ix86_expand_vec_shift_qihi_constant (code, dest, op1, op2)) + return; + + if (TARGET_AVX512BW + && VECTOR_MODE_P (GET_MODE (op2)) + && ix86_expand_vecop_qihi2 (code, dest, op1, op2)) + return; + switch (qimode) { case E_V16QImode: @@ -20205,7 +21435,6 @@ gcc_unreachable (); } - op2_l = op2_h = op2; switch (code) { case MULT: @@ -20234,17 +21463,46 @@ op1_h = gen_reg_rtx (himode); ix86_expand_sse_unpack (op1_l, op1, uns_p, false); ix86_expand_sse_unpack (op1_h, op1, uns_p, true); + /* vashr/vlshr/vashl */ + if (GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT) + { + rtx tmp = force_reg (qimode, op2); + op2_l = gen_reg_rtx (himode); + op2_h = gen_reg_rtx (himode); + ix86_expand_sse_unpack (op2_l, tmp, uns_p, false); + ix86_expand_sse_unpack (op2_h, tmp, uns_p, true); + } + else + op2_l = op2_h = op2; + full_interleave = true; break; default: gcc_unreachable (); } - /* Perform the operation. */ - res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX, - 1, OPTAB_DIRECT); - res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX, - 1, OPTAB_DIRECT); + /* Perform vashr/vlshr/vashl. */ + if (code != MULT + && GET_MODE_CLASS (GET_MODE (op2)) == MODE_VECTOR_INT) + { + res_l = gen_reg_rtx (himode); + res_h = gen_reg_rtx (himode); + emit_insn (gen_rtx_SET (res_l, + simplify_gen_binary (code, himode, + op1_l, op2_l))); + emit_insn (gen_rtx_SET (res_h, + simplify_gen_binary (code, himode, + op1_h, op2_h))); + } + /* Perform mult/ashr/lshr/ashl. 
*/ + else + { + res_l = expand_simple_binop (himode, code, op1_l, op2_l, NULL_RTX, + 1, OPTAB_DIRECT); + res_h = expand_simple_binop (himode, code, op1_h, op2_h, NULL_RTX, + 1, OPTAB_DIRECT); + } + gcc_assert (res_l && res_h); /* Merge the data back into the right place. */