author     Andre Vieira <andre.simoesdiasvieira@arm.com>  2024-06-19 17:05:55 +0100
committer  Andre Vieira <andre.simoesdiasvieira@arm.com>  2024-06-19 17:11:17 +0100
commit     3dfc28dbbd21b1d708aa40064380ef4c42c994d7
tree       340ee5cc055032837da2506ebaa808675f63eaad
parent     5d0c1b4e0d33c2d1077264636d0a65ce206d0d96
arm: Add support for MVE Tail-Predicated Low Overhead Loops
This patch adds support for MVE Tail-Predicated Low Overhead Loops by using
the doloop functionality added to support predicated vectorized hardware
loops.

gcc/ChangeLog:

        * config/arm/arm-protos.h (arm_target_bb_ok_for_lob): Change
        declaration to pass basic_block.
        (arm_attempt_dlstp_transform): New declaration.
        * config/arm/arm.cc (TARGET_LOOP_UNROLL_ADJUST): Define target hook.
        (TARGET_PREDICT_DOLOOP_P): Likewise.
        (arm_target_bb_ok_for_lob): Adapt condition.
        (arm_mve_get_vctp_lanes): New function.
        (arm_dl_usage_type): New internal enum.
        (arm_get_required_vpr_reg): New function.
        (arm_get_required_vpr_reg_param): New function.
        (arm_get_required_vpr_reg_ret_val): New function.
        (arm_mve_get_loop_vctp): New function.
        (arm_mve_insn_predicated_by): New function.
        (arm_mve_across_lane_insn_p): New function.
        (arm_mve_load_store_insn_p): New function.
        (arm_mve_impl_pred_on_outputs_p): New function.
        (arm_mve_impl_pred_on_inputs_p): New function.
        (arm_last_vect_def_insn): New function.
        (arm_mve_impl_predicated_p): New function.
        (arm_mve_check_reg_origin_is_num_elems): New function.
        (arm_mve_dlstp_check_inc_counter): New function.
        (arm_mve_dlstp_check_dec_counter): New function.
        (arm_mve_loop_valid_for_dlstp): New function.
        (arm_predict_doloop_p): New function.
        (arm_loop_unroll_adjust): New function.
        (arm_emit_mve_unpredicated_insn_to_seq): New function.
        (arm_attempt_dlstp_transform): New function.
        * config/arm/arm.opt (mdlstp): New option.
        * config/arm/iterators.md (dlstp_elemsize, letp_num_lanes,
        letp_num_lanes_neg, letp_num_lanes_minus_1): New attributes.
        (DLSTP, LETP): New iterators.
        * config/arm/mve.md (predicated_doloop_end_internal<letp_num_lanes>,
        dlstp<dlstp_elemsize>_insn): New insn patterns.
        * config/arm/thumb2.md (doloop_end): Adapt to support
        tail-predicated loops.
        (doloop_begin): Likewise.
        * config/arm/types.md (mve_misc): New mve type to represent
        predicated_loop_end insn sequences.
        * config/arm/unspecs.md (DLSTP8, DLSTP16, DLSTP32, DLSTP64, LETP8,
        LETP16, LETP32, LETP64): New unspecs for DLSTP and LETP.

gcc/testsuite/ChangeLog:

        * gcc.target/arm/lob.h: Add new helpers.
        * gcc.target/arm/lob1.c: Use new helpers.
        * gcc.target/arm/lob6.c: Likewise.
        * gcc.target/arm/mve/dlstp-compile-asm-1.c: New test.
        * gcc.target/arm/mve/dlstp-compile-asm-2.c: New test.
        * gcc.target/arm/mve/dlstp-compile-asm-3.c: New test.
        * gcc.target/arm/mve/dlstp-int8x16.c: New test.
        * gcc.target/arm/mve/dlstp-int8x16-run.c: New test.
        * gcc.target/arm/mve/dlstp-int16x8.c: New test.
        * gcc.target/arm/mve/dlstp-int16x8-run.c: New test.
        * gcc.target/arm/mve/dlstp-int32x4.c: New test.
        * gcc.target/arm/mve/dlstp-int32x4-run.c: New test.
        * gcc.target/arm/mve/dlstp-int64x2.c: New test.
        * gcc.target/arm/mve/dlstp-int64x2-run.c: New test.
        * gcc.target/arm/mve/dlstp-invalid-asm.c: New test.

Co-authored-by: Stam Markianos-Wright <stam.markianos-wright@arm.com>
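For illustration only (not part of the commit), below is a minimal sketch of
the kind of intrinsics loop the new transformation targets, matching the
DLSTP_TYPE_A shape described in arm_mve_loop_valid_for_dlstp. The function
name and parameters are made up; the intrinsics are the standard arm_mve.h
ones exercised by the new dlstp-int32x4 tests, and the compile flags and
expected codegen are assumptions rather than verified output.

#include <arm_mve.h>
#include <stdint.h>

/* Hypothetical example: add N int32_t elements with tail predication.
   Each iteration handles 4 lanes; vctp32q masks off the lanes beyond n
   in the final iteration.  With MVE and LOB available (e.g. something
   like -O2 -march=armv8.1-m.main+mve) and the new -mdlstp flag left at
   its default, the doloop pass can now rewrite this into a dlstp.32/letp
   loop, roughly: dlstp.32 lr, rN; <unpredicated body>; letp lr, <loop>.  */
void
vadd_int32 (int32_t *a, int32_t *b, int32_t *c, int n)
{
  while (n > 0)
    {
      mve_pred16_t p = vctp32q (n);        /* predicate on remaining elements */
      int32x4_t va = vldrwq_z_s32 (a, p);  /* zero-predicated loads */
      int32x4_t vb = vldrwq_z_s32 (b, p);
      vstrwq_p_s32 (c, vaddq_x_s32 (va, vb, p), p);
      a += 4;
      b += 4;
      c += 4;
      n -= 4;                              /* decrement by the number of lanes */
    }
}

The analysis added in arm.cc below checks that every instruction in such a
loop is either explicitly predicated on the vctp mask or safe to predicate
implicitly, and only then strips the per-instruction VPT predication.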
-rw-r--r--  gcc/config/arm/arm-protos.h                             |    4
-rw-r--r--  gcc/config/arm/arm.cc                                   | 1249
-rw-r--r--  gcc/config/arm/arm.opt                                  |    3
-rw-r--r--  gcc/config/arm/iterators.md                             |   15
-rw-r--r--  gcc/config/arm/mve.md                                   |   50
-rw-r--r--  gcc/config/arm/thumb2.md                                |  138
-rw-r--r--  gcc/config/arm/types.md                                 |    6
-rw-r--r--  gcc/config/arm/unspecs.md                               |   14
-rw-r--r--  gcc/testsuite/gcc.target/arm/lob.h                      |  128
-rw-r--r--  gcc/testsuite/gcc.target/arm/lob1.c                     |   23
-rw-r--r--  gcc/testsuite/gcc.target/arm/lob6.c                     |    8
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c  |  146
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c  |  749
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c  |   46
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c    |   44
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c        |   31
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4-run.c    |   45
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4.c        |   31
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2-run.c    |   48
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2.c        |   28
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16-run.c    |   44
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16.c        |   32
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c    |  521
23 files changed, 3321 insertions, 82 deletions
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 2cd560c9925..34d6be76e94 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -65,8 +65,8 @@ extern void arm_emit_speculation_barrier_function (void);
extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *);
extern bool arm_q_bit_access (void);
extern bool arm_ge_bits_access (void);
-extern bool arm_target_insn_ok_for_lob (rtx);
-
+extern bool arm_target_bb_ok_for_lob (basic_block);
+extern int arm_attempt_dlstp_transform (rtx);
#ifdef RTX_CODE
enum reg_class
arm_mode_base_reg_class (machine_mode);
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index b8c32db0a1d..7d67d2cfee9 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -668,6 +668,12 @@ static const scoped_attribute_specs *const arm_attribute_table[] =
#undef TARGET_HAVE_CONDITIONAL_EXECUTION
#define TARGET_HAVE_CONDITIONAL_EXECUTION arm_have_conditional_execution
+#undef TARGET_LOOP_UNROLL_ADJUST
+#define TARGET_LOOP_UNROLL_ADJUST arm_loop_unroll_adjust
+
+#undef TARGET_PREDICT_DOLOOP_P
+#define TARGET_PREDICT_DOLOOP_P arm_predict_doloop_p
+
#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P arm_legitimate_constant_p
@@ -34659,19 +34665,1236 @@ arm_invalid_within_doloop (const rtx_insn *insn)
}
bool
-arm_target_insn_ok_for_lob (rtx insn)
-{
- basic_block bb = BLOCK_FOR_INSN (insn);
- /* Make sure the basic block of the target insn is a simple latch
- having as single predecessor and successor the body of the loop
- itself. Only simple loops with a single basic block as body are
- supported for 'low over head loop' making sure that LE target is
- above LE itself in the generated code. */
-
- return single_succ_p (bb)
- && single_pred_p (bb)
- && single_succ_edge (bb)->dest == single_pred_edge (bb)->src
- && contains_no_active_insn_p (bb);
+arm_target_bb_ok_for_lob (basic_block bb)
+{
+ /* Make sure the basic block is a simple latch having as the single
+ predecessor and successor the body of the loop itself.
+ Only simple loops with a single basic block as the body are supported
+ for low-overhead loops, making sure that the LE target is above the LE
+ instruction in the generated code. */
+ return (single_succ_p (bb)
+ && single_pred_p (bb)
+ && single_succ_edge (bb)->dest == single_pred_edge (bb)->src);
+}
+
+/* Utility function: Given a VCTP or a VCTP_M insn, return the number of MVE
+ lanes based on the machine mode being used. */
+
+static int
+arm_mve_get_vctp_lanes (rtx_insn *insn)
+{
+ rtx insn_set = single_set (insn);
+ if (insn_set
+ && GET_CODE (SET_SRC (insn_set)) == UNSPEC
+ && (XINT (SET_SRC (insn_set), 1) == VCTP
+ || XINT (SET_SRC (insn_set), 1) == VCTP_M))
+ {
+ machine_mode mode = GET_MODE (SET_SRC (insn_set));
+ return ((VECTOR_MODE_P (mode) && VALID_MVE_PRED_MODE (mode))
+ ? GET_MODE_NUNITS (mode) : 0);
+ }
+ return 0;
+}
+
+enum arm_dl_usage_type { DL_USAGE_ANY = 0,
+ DL_USAGE_READ = 1,
+ DL_USAGE_WRITE = 2 };
+
+/* Check if INSN requires the use of the VPR reg; if it does, return the
+ sub-rtx of the VPR reg. The TYPE argument controls whether
+ this function should:
+ * For TYPE == DL_USAGE_ANY, check all operands, including the OUT operands,
+ and return the first occurrence of the VPR reg.
+ * For TYPE == DL_USAGE_READ, only check the input operands.
+ * For TYPE == DL_USAGE_WRITE, only check the output operands.
+ (INOUT operands are considered both as input and output operands)
+*/
+static rtx
+arm_get_required_vpr_reg (rtx_insn *insn,
+ arm_dl_usage_type type = DL_USAGE_ANY)
+{
+ gcc_assert (type < 3);
+ if (!NONJUMP_INSN_P (insn))
+ return NULL_RTX;
+
+ bool requires_vpr;
+ extract_constrain_insn (insn);
+ int n_operands = recog_data.n_operands;
+ if (recog_data.n_alternatives == 0)
+ return NULL_RTX;
+
+ /* Fill in recog_op_alt with information about the constraints of
+ this insn. */
+ preprocess_constraints (insn);
+
+ for (int op = 0; op < n_operands; op++)
+ {
+ requires_vpr = true;
+ if (type == DL_USAGE_READ
+ && recog_data.operand_type[op] == OP_OUT)
+ continue;
+ else if (type == DL_USAGE_WRITE
+ && recog_data.operand_type[op] == OP_IN)
+ continue;
+
+ /* Iterate through alternatives of operand "op" in recog_op_alt and
+ identify if the operand is required to be the VPR. */
+ for (int alt = 0; alt < recog_data.n_alternatives; alt++)
+ {
+ const operand_alternative *op_alt
+ = &recog_op_alt[alt * n_operands];
+ /* Fetch the reg_class for each entry and check it against the
+ VPR_REG reg_class. */
+ if (alternative_class (op_alt, op) != VPR_REG)
+ requires_vpr = false;
+ }
+ /* If all alternatives of the insn require the VPR reg for this operand,
+ it means that either this is a VPR-generating instruction, like a vctp,
+ vcmp, etc., or it is a VPT-predicated instruction. Return the sub-rtx
+ of the VPR reg operand. */
+ if (requires_vpr)
+ return recog_data.operand[op];
+ }
+ return NULL_RTX;
+}
+
+/* Wrapper function of arm_get_required_vpr_reg with TYPE == DL_USAGE_READ,
+ so return the VPR only if it is an input operand to the insn. */
+
+static rtx
+arm_get_required_vpr_reg_param (rtx_insn *insn)
+{
+ return arm_get_required_vpr_reg (insn, DL_USAGE_READ);
+}
+
+/* Wrapper function of arm_get_required_vpr_reg with TYPE == DL_USAGE_WRITE,
+ so return the VPR only if it is the return value, an output of, or is
+ clobbered by the insn. */
+
+static rtx
+arm_get_required_vpr_reg_ret_val (rtx_insn *insn)
+{
+ return arm_get_required_vpr_reg (insn, DL_USAGE_WRITE);
+}
+
+/* Return the first VCTP instruction in BB, if it exists, or NULL otherwise. */
+
+static rtx_insn *
+arm_mve_get_loop_vctp (basic_block bb)
+{
+ rtx_insn *insn = BB_HEAD (bb);
+
+ /* Now scan through all the instruction patterns and pick out the VCTP
+ instruction. We require arm_get_required_vpr_reg_param to be false
+ to make sure we pick up a VCTP, rather than a VCTP_M. */
+ FOR_BB_INSNS (bb, insn)
+ if (NONDEBUG_INSN_P (insn))
+ if (arm_get_required_vpr_reg_ret_val (insn)
+ && (arm_mve_get_vctp_lanes (insn) != 0)
+ && !arm_get_required_vpr_reg_param (insn))
+ return insn;
+ return NULL;
+}
+
+/* Return true if INSN is an MVE instruction that is VPT-predicable and is
+ predicated on VPR_REG. */
+
+static bool
+arm_mve_insn_predicated_by (rtx_insn *insn, rtx vpr_reg)
+{
+ rtx insn_vpr_reg_operand = (MVE_VPT_PREDICATED_INSN_P (insn)
+ ? arm_get_required_vpr_reg_param (insn)
+ : NULL_RTX);
+ return (insn_vpr_reg_operand
+ && rtx_equal_p (vpr_reg, insn_vpr_reg_operand));
+}
+
+/* Utility function to identify if INSN is an MVE instruction that performs
+ some across lane operation (and as a result does not align with normal
+ lane predication rules). All such instructions give only one scalar
+ output, except for vshlcq which gives a PARALLEL of a vector and a scalar
+ (one vector result and one carry output). */
+
+static bool
+arm_mve_across_lane_insn_p (rtx_insn* insn)
+{
+ df_ref insn_defs = NULL;
+ if (!MVE_VPT_PREDICABLE_INSN_P (insn))
+ return false;
+
+ FOR_EACH_INSN_DEF (insn_defs, insn)
+ if (!VALID_MVE_MODE (GET_MODE (DF_REF_REG (insn_defs)))
+ && !arm_get_required_vpr_reg_ret_val (insn))
+ return true;
+
+ return false;
+}
+
+/* Utility function to identify if INSN is an MVE load or store instruction.
+ * For TYPE == DL_USAGE_ANY, check all operands. If the function returns
+ true, INSN is a load or a store insn.
+ * For TYPE == DL_USAGE_READ, only check the input operands. If the
+ function returns true, INSN is a load insn.
+ * For TYPE == DL_USAGE_WRITE, only check the output operands. If the
+ function returns true, INSN is a store insn. */
+
+static bool
+arm_mve_load_store_insn_p (rtx_insn* insn,
+ arm_dl_usage_type type = DL_USAGE_ANY)
+{
+ gcc_assert (type < 3);
+ int n_operands = recog_data.n_operands;
+ extract_insn (insn);
+
+ for (int op = 0; op < n_operands; op++)
+ {
+ if (type == DL_USAGE_READ && recog_data.operand_type[op] == OP_OUT)
+ continue;
+ else if (type == DL_USAGE_WRITE && recog_data.operand_type[op] == OP_IN)
+ continue;
+ if (mve_memory_operand (recog_data.operand[op],
+ GET_MODE (recog_data.operand[op])))
+ return true;
+ }
+ return false;
+}
+
+/* Return TRUE if INSN is validated for implicit predication by how its outputs
+ are used.
+
+ If INSN is an MVE operation across lanes that is not predicated by
+ VCTP_VPR_GENERATED, it cannot be validated by the use of its outputs.
+
+ Any other INSN is safe to implicitly predicate if we don't use its outputs
+ outside the loop. The instructions that use this INSN's outputs will be
+ validated as we go through the analysis. */
+
+static bool
+arm_mve_impl_pred_on_outputs_p (rtx_insn *insn, rtx vctp_vpr_generated)
+{
+ /* Reject any unpredicated across lane operation. */
+ if (!arm_mve_insn_predicated_by (insn, vctp_vpr_generated)
+ && arm_mve_across_lane_insn_p (insn))
+ return false;
+
+ /* Next, scan forward to the various USEs of the DEFs in this insn. */
+ df_ref insn_def = NULL;
+ basic_block insn_bb = BLOCK_FOR_INSN (insn);
+ FOR_EACH_INSN_DEF (insn_def, insn)
+ {
+ for (df_ref use = DF_REG_USE_CHAIN (DF_REF_REGNO (insn_def));
+ use;
+ use = DF_REF_NEXT_REG (use))
+ {
+ rtx_insn *next_use_insn = DF_REF_INSN (use);
+ if (!INSN_P (next_use_insn) || DEBUG_INSN_P (next_use_insn))
+ continue;
+
+ if (insn_bb != BLOCK_FOR_INSN (next_use_insn))
+ return false;
+ }
+ }
+ return true;
+}
+
+
+/* Return the prevailing definition of OP before CUR_INSN in the same
+ basic block as CUR_INSN if one exists, or NULL otherwise. */
+
+static rtx_insn*
+arm_last_vect_def_insn (rtx op, rtx_insn *cur_insn)
+{
+ if (!REG_P (op)
+ || !BLOCK_FOR_INSN (cur_insn))
+ return NULL;
+
+ df_ref def_insns;
+ rtx_insn *last_def = NULL;
+ for (def_insns = DF_REG_DEF_CHAIN (REGNO (op));
+ def_insns;
+ def_insns = DF_REF_NEXT_REG (def_insns))
+ {
+ rtx_insn *def_insn = DF_REF_INSN (def_insns);
+ /* Definition not in the loop body or after the current insn. */
+ if (DF_REF_BB (def_insns) != BLOCK_FOR_INSN (cur_insn)
+ || INSN_UID (def_insn) >= INSN_UID (cur_insn))
+ continue;
+
+ if (!last_def || INSN_UID (def_insn) > INSN_UID (last_def))
+ last_def = def_insn;
+ }
+ return last_def;
+}
+
+
+/* This function returns TRUE if we can validate the implicit predication of
+ INSN_IN with VCTP_VPR_GENERATED based on the definition of the instruction's
+ input operands.
+
+ If INSN_IN is an MVE operation across lanes, then all of its MVE vector
+ operands must have their tail-predicated lanes be zeroes. We keep track of any
+ instructions that define vector operands for which this is true in
+ PROPS_ZERO_SET.
+
+ For any other INSN_IN, all of its operands must be defined
+ inside the loop body by an instruction that comes before INSN_IN and is not
+ an MVE load predicated by a different VPR. These instructions have all been
+ validated for explicit or implicit predication.
+ */
+
+static bool
+arm_mve_impl_pred_on_inputs_p (vec <rtx_insn *> *props_zero_set,
+ rtx_insn *insn_in, rtx vctp_vpr_generated)
+{
+ /* If all inputs come from instructions that are explicitly or
+ implicitly predicated by the same predicate then it is safe to
+ implicitly predicate this instruction. */
+ df_ref insn_uses = NULL;
+ bool across_lane = arm_mve_across_lane_insn_p (insn_in);
+ FOR_EACH_INSN_USE (insn_uses, insn_in)
+ {
+ rtx op = DF_REF_REG (insn_uses);
+ rtx_insn *def_insn = arm_last_vect_def_insn (op, insn_in);
+ if (across_lane)
+ {
+ if (!VALID_MVE_MODE (GET_MODE (op)))
+ continue;
+ if (!def_insn || !props_zero_set->contains (def_insn))
+ return false;
+
+ continue;
+ }
+
+ if (!def_insn
+ || (!arm_mve_insn_predicated_by (def_insn, vctp_vpr_generated)
+ && arm_mve_load_store_insn_p (def_insn, DL_USAGE_READ)))
+ return false;
+ }
+
+ return true;
+}
+
+
+/* Determine whether INSN_IN is safe to implicitly predicate based on the type
+ of instruction and where needed the definition of its inputs and the uses of
+ its outputs.
+ Return TRUE if it is safe to implicitly predicate and FALSE otherwise.
+
+ * If INSN_IN is a store, then it is always unsafe to implicitly predicate it.
+ * If INSN_IN is a load, only reject implicit predication if its uses
+ directly invalidate it.
+ * If INSN_IN operates across vector lanes and does not have the
+ "mve_safe_imp_xlane_pred" attribute, then it is always unsafe to implicitly
+ predicate.
+ * If INSN_IN operates on Floating Point elements and we are not compiling
+ with -Ofast, then it is unsafe to implicitly predicate it as we may be
+ changing exception and cumulative bits behaviour.
+ * If INSN_IN is a VCTP instruction, then it is safe to implicitly predicate,
+ but instructions that use this predicate will need to be checked
+ just like any other UNPREDICATED MVE instruction.
+ * Otherwise check if INSN_IN's inputs or uses of outputs can validate its
+ implicit predication.
+
+ * If all inputs come from instructions that are explicitly or implicitly
+ predicated by the same predicate then it is safe to implicitly predicate
+ this instruction.
+ * If INSN_IN is an operation across lanes with the "mve_safe_imp_xlane_pred"
+ attribute, then all its operands must have zeroed falsely predicated tail
+ lanes.
+
+ * Otherwise, check if the implicit predication of INSN_IN can be validated
+ based on its inputs, and if not check whether it can be validated based on
+ how its outputs are used. */
+
+static bool
+arm_mve_impl_predicated_p (vec <rtx_insn *> *props_zero_set,
+ rtx_insn *insn_in, rtx vctp_vpr_generated)
+{
+
+ /* If INSN_IN is a store, then it is always unsafe to implicitly
+ predicate it. */
+ if (arm_mve_load_store_insn_p (insn_in, DL_USAGE_WRITE))
+ return false;
+
+ /* If INSN_IN is a load, only reject implicit predication if its uses
+ directly invalidate it. */
+ if (arm_mve_load_store_insn_p (insn_in, DL_USAGE_READ))
+ {
+ if (!arm_mve_impl_pred_on_outputs_p (insn_in, vctp_vpr_generated))
+ return false;
+ return true;
+ }
+
+ /* If INSN_IN operates across vector lanes and does not have the
+ "mve_safe_imp_xlane_pred" attribute, then it is always unsafe to implicitly
+ predicate. */
+ if (arm_mve_across_lane_insn_p (insn_in)
+ && (get_attr_mve_safe_imp_xlane_pred (insn_in)
+ != MVE_SAFE_IMP_XLANE_PRED_YES))
+ return false;
+
+ /* If INSN_IN operates on Floating Point elements and we are not compiling
+ with -Ofast, then it is unsafe to implicitly predicate it as we may be
+ changing exception and cumulative bits behaviour. */
+ if (!flag_unsafe_math_optimizations
+ && flag_trapping_math
+ && MVE_VPT_UNPREDICATED_INSN_P (insn_in))
+ {
+ df_ref def;
+ FOR_EACH_INSN_DEF (def, insn_in)
+ if (DF_REF_TYPE (def) == DF_REF_REG_DEF
+ && FLOAT_MODE_P (GET_MODE (DF_REF_REG (def))))
+ return false;
+ FOR_EACH_INSN_USE (def, insn_in)
+ if (DF_REF_TYPE (def) == DF_REF_REG_DEF
+ && FLOAT_MODE_P (GET_MODE (DF_REF_REG (def))))
+ return false;
+ }
+
+ /* If INSN_IN is a VCTP instruction, then it is safe to implicitly predicate,
+ but instructions that use this predicate will need to be checked
+ just like any other UNPREDICATED MVE instruction. */
+ if (arm_get_required_vpr_reg_ret_val (insn_in)
+ && (arm_mve_get_vctp_lanes (insn_in) != 0))
+ return true;
+
+ /* Otherwise, check if the implicit predication of INSN_IN can be validated
+ based on its inputs, and if not check whether it can be validated based on
+ how its outputs are used. */
+ return (arm_mve_impl_pred_on_inputs_p (props_zero_set, insn_in, vctp_vpr_generated)
+ || arm_mve_impl_pred_on_outputs_p (insn_in, vctp_vpr_generated));
+}
+
+/* Helper function to `arm_mve_dlstp_check_inc_counter` and to
+ `arm_mve_dlstp_check_dec_counter`. In the situations where the loop counter
+ is incrementing by 1 or decrementing by 1 in each iteration, ensure that the
+ number of iterations, the value of REG, going into the loop, was calculated
+ as:
+ REG = (N + [1, VCTP_STEP - 1]) / VCTP_STEP
+
+ where N is equivalent to the VCTP_REG.
+*/
+
+static bool
+arm_mve_check_reg_origin_is_num_elems (loop *loop, rtx reg, rtx vctp_step,
+ rtx vctp_reg)
+{
+ df_ref counter_max_last_def = NULL;
+
+ /* More than one reaching definition. */
+ if (DF_REG_DEF_COUNT (REGNO (reg)) > 2)
+ return false;
+
+ /* Look for a single definition of REG going into the loop. The DEF_CHAIN will
+ have at least two values, as this is a loop induction variable that is
+ defined outside the loop. */
+ for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
+ def;
+ def = DF_REF_NEXT_REG (def))
+ {
+ /* Skip the update inside the loop, this has already been checked by the
+ iv_analyze call earlier. */
+ if (DF_REF_BB (def) == loop->header)
+ continue;
+
+ counter_max_last_def = def;
+ break;
+ }
+
+ if (!counter_max_last_def)
+ return false;
+
+ rtx counter_max_last_set = single_set (DF_REF_INSN (counter_max_last_def));
+
+ if (!counter_max_last_set)
+ return false;
+
+ /* If we encounter a simple SET from a REG, follow it through. */
+ if (REG_P (SET_SRC (counter_max_last_set)))
+ {
+ if (DF_REG_DEF_COUNT (REGNO (SET_SRC (counter_max_last_set))) != 1)
+ return false;
+
+ counter_max_last_def
+ = DF_REG_DEF_CHAIN (REGNO (SET_SRC (counter_max_last_set)));
+ counter_max_last_set
+ = single_set (DF_REF_INSN (counter_max_last_def));
+
+ if (!counter_max_last_set)
+ return false;
+ }
+
+ /* We are looking for:
+ COUNTER_MAX_LAST_SET = (N + VCTP_STEP - 1) / VCTP_STEP.
+ We currently only support the unsigned VCTP_OP case. */
+ rtx division = SET_SRC (counter_max_last_set);
+ if (GET_CODE (division) != LSHIFTRT)
+ return false;
+
+ /* Now check that we are dividing by VCTP_STEP, i.e. the number of lanes. */
+ rtx divisor = XEXP (division, 1);
+ unsigned vctp_step_cst = abs_hwi (INTVAL (vctp_step));
+ if (!CONST_INT_P (divisor)
+ || (1U << INTVAL (divisor) != vctp_step_cst))
+ return false;
+
+ rtx dividend = XEXP (division, 0);
+ if (!REG_P (dividend))
+ /* Subreg? */
+ return false;
+
+ /* For now only support the simple case: this only works for unsigned N;
+ any signed N will have further computations to deal with overflow. */
+ if (DF_REG_DEF_COUNT (REGNO (dividend)) != 1)
+ return false;
+
+ rtx_insn *dividend_insn = DF_REF_INSN (DF_REG_DEF_CHAIN (REGNO (dividend)));
+ rtx dividend_op = single_set (dividend_insn);
+ if (!dividend_op
+ || GET_CODE (SET_SRC (dividend_op)) != PLUS)
+ return false;
+
+ /* Check if PLUS_OP is (VCTP_OP + VAL), where VAL = [1, VCTP_STEP - 1]. */
+ rtx plus_op = SET_SRC (dividend_op);
+ if (!REG_P (XEXP (plus_op, 0))
+ || !CONST_INT_P (XEXP (plus_op, 1))
+ || !IN_RANGE (INTVAL (XEXP (plus_op, 1)), 1, vctp_step_cst - 1))
+ return false;
+
+ /* VCTP_REG may have been copied before entering the loop; let's see if we can
+ trace such a copy back. If we have more than one reaching definition then
+ bail out as analysis will be too difficult. */
+ if (DF_REG_DEF_COUNT (REGNO (vctp_reg)) > 2)
+ return false;
+
+ /* Look for the definition of N. */
+ for (df_ref def = DF_REG_DEF_CHAIN (REGNO (vctp_reg));
+ def;
+ def = DF_REF_NEXT_REG (def))
+ {
+ if (DF_REF_BB (def) == loop->header)
+ continue;
+ rtx set = single_set (DF_REF_INSN (def));
+ if (set
+ && REG_P (SET_SRC (set))
+ && !HARD_REGISTER_P (SET_SRC (set)))
+ vctp_reg = SET_SRC (set);
+ }
+
+ return rtx_equal_p (vctp_reg, XEXP (plus_op, 0));
+}
+
+/* If we have identified the loop to have an incrementing counter, we need to
+ make sure that it increments by 1 and that the loop is structured correctly:
+ * The counter starts from 0
+ * The counter terminates at (num_of_elem + num_of_lanes - 1) / num_of_lanes
+ * The vctp insn uses a reg that decrements appropriately in each iteration.
+*/
+
+static rtx_insn*
+arm_mve_dlstp_check_inc_counter (loop *loop, rtx_insn* vctp_insn,
+ rtx condconst, rtx condcount)
+{
+ rtx vctp_reg = XVECEXP (XEXP (PATTERN (vctp_insn), 1), 0, 0);
+ /* The loop latch has to be empty. When compiling all the known MVE LoLs in
+ user applications, none of those with incrementing counters had any real
+ insns in the loop latch. As such, this function has only been tested with
+ an empty latch and may misbehave or ICE if we somehow get here with an
+ increment in the latch, so, for correctness, error out early. */
+ if (!empty_block_p (loop->latch))
+ return NULL;
+
+ class rtx_iv vctp_reg_iv;
+ /* For loops of DLSTP_TYPE_B, the loop counter is independent of the decrement
+ of the reg used in the vctp_insn. So run iv analysis on that reg. This
+ has to succeed for such loops to be supported. */
+ if (!iv_analyze (vctp_insn, as_a<scalar_int_mode> (GET_MODE (vctp_reg)),
+ vctp_reg, &vctp_reg_iv))
+ return NULL;
+
+ /* Extract the decrementnum of the vctp reg from the iv. This decrementnum
+ is the number of lanes/elements it decrements from the remaining number of
+ lanes/elements to process in the loop; for this reason this is always a
+ negative number, but to simplify later checks we use its absolute value. */
+ HOST_WIDE_INT decrementnum = INTVAL (vctp_reg_iv.step);
+ if (decrementnum >= 0)
+ return NULL;
+ decrementnum = abs_hwi (decrementnum);
+
+ /* Find where both of those are modified in the loop header bb. */
+ df_ref condcount_reg_set_df = df_bb_regno_only_def_find (loop->header,
+ REGNO (condcount));
+ df_ref vctp_reg_set_df = df_bb_regno_only_def_find (loop->header,
+ REGNO (vctp_reg));
+ if (!condcount_reg_set_df || !vctp_reg_set_df)
+ return NULL;
+ rtx condcount_reg_set = single_set (DF_REF_INSN (condcount_reg_set_df));
+ rtx vctp_reg_set = single_set (DF_REF_INSN (vctp_reg_set_df));
+ if (!condcount_reg_set || !vctp_reg_set)
+ return NULL;
+
+ /* Ensure the modification of the vctp reg from df is consistent with
+ the iv and the number of lanes on the vctp insn. */
+ if (GET_CODE (SET_SRC (vctp_reg_set)) != PLUS
+ || !REG_P (SET_DEST (vctp_reg_set))
+ || !REG_P (XEXP (SET_SRC (vctp_reg_set), 0))
+ || REGNO (SET_DEST (vctp_reg_set))
+ != REGNO (XEXP (SET_SRC (vctp_reg_set), 0))
+ || !CONST_INT_P (XEXP (SET_SRC (vctp_reg_set), 1))
+ || INTVAL (XEXP (SET_SRC (vctp_reg_set), 1)) >= 0
+ || decrementnum != abs_hwi (INTVAL (XEXP (SET_SRC (vctp_reg_set), 1)))
+ || decrementnum != arm_mve_get_vctp_lanes (vctp_insn))
+ return NULL;
+
+ if (REG_P (condcount) && REG_P (condconst))
+ {
+ /* First we need to prove that the loop is going 0..condconst with an
+ inc of 1 in each iteration. */
+ if (GET_CODE (SET_SRC (condcount_reg_set)) == PLUS
+ && CONST_INT_P (XEXP (SET_SRC (condcount_reg_set), 1))
+ && INTVAL (XEXP (SET_SRC (condcount_reg_set), 1)) == 1)
+ {
+ rtx counter_reg = SET_DEST (condcount_reg_set);
+ /* Check that the counter did indeed start from zero. */
+ df_ref this_set = DF_REG_DEF_CHAIN (REGNO (counter_reg));
+ if (!this_set)
+ return NULL;
+ df_ref last_set_def = DF_REF_NEXT_REG (this_set);
+ if (!last_set_def)
+ return NULL;
+ rtx_insn* last_set_insn = DF_REF_INSN (last_set_def);
+ rtx last_set = single_set (last_set_insn);
+ if (!last_set)
+ return NULL;
+ rtx counter_orig_set;
+ counter_orig_set = SET_SRC (last_set);
+ if (!CONST_INT_P (counter_orig_set)
+ || (INTVAL (counter_orig_set) != 0))
+ return NULL;
+ /* And finally check that the target value of the counter,
+ condconst, is of the correct shape. */
+ if (!arm_mve_check_reg_origin_is_num_elems (loop, condconst,
+ vctp_reg_iv.step,
+ vctp_reg))
+ return NULL;
+ }
+ else
+ return NULL;
+ }
+ else
+ return NULL;
+
+ /* Everything looks valid. */
+ return vctp_insn;
+}
+
+/* Helper function to `arm_mve_loop_valid_for_dlstp`. In the case of a
+ counter that is decrementing, ensure that it is decrementing by the
+ right amount in each iteration and that the target condition is what
+ we expect. */
+
+static rtx_insn*
+arm_mve_dlstp_check_dec_counter (loop *loop, rtx_insn* vctp_insn,
+ rtx condconst, rtx condcount)
+{
+ rtx vctp_reg = XVECEXP (XEXP (PATTERN (vctp_insn), 1), 0, 0);
+ class rtx_iv vctp_reg_iv;
+ HOST_WIDE_INT decrementnum;
+ /* For decrementing loops of DLSTP_TYPE_A, the counter is usually present in the
+ loop latch. Here we simply need to verify that this counter is the same
+ reg that is also used in the vctp_insn and that it is not otherwise
+ modified. */
+ rtx_insn *dec_insn = BB_END (loop->latch);
+ /* If not in the loop latch, try to find the decrement in the loop header. */
+ if (!NONDEBUG_INSN_P (dec_insn))
+ {
+ df_ref temp = df_bb_regno_only_def_find (loop->header, REGNO (condcount));
+ /* If we haven't been able to find the decrement, bail out. */
+ if (!temp)
+ return NULL;
+ dec_insn = DF_REF_INSN (temp);
+ }
+
+ rtx dec_set = single_set (dec_insn);
+
+ /* Next, ensure that it is a PLUS of the form:
+ (set (reg a) (plus (reg a) (const_int)))
+ where (reg a) is the same as condcount. */
+ if (!dec_set
+ || !REG_P (SET_DEST (dec_set))
+ || !REG_P (XEXP (SET_SRC (dec_set), 0))
+ || !CONST_INT_P (XEXP (SET_SRC (dec_set), 1))
+ || REGNO (SET_DEST (dec_set))
+ != REGNO (XEXP (SET_SRC (dec_set), 0))
+ || REGNO (SET_DEST (dec_set)) != REGNO (condcount))
+ return NULL;
+
+ decrementnum = INTVAL (XEXP (SET_SRC (dec_set), 1));
+
+ /* This decrementnum is the number of lanes/elements it decrements from the
+ remaining number of lanes/elements to process in the loop; for this reason
+ this is always a negative number, but to simplify later checks we use its
+ absolute value. */
+ if (decrementnum >= 0)
+ return NULL;
+ decrementnum = -decrementnum;
+
+ /* If the decrementnum is a 1, then we need to look at the loop vctp_reg and
+ verify that it also decrements correctly.
+ Then, we need to establish that the starting value of the loop decrement
+ originates from the starting value of the vctp decrement. */
+ if (decrementnum == 1)
+ {
+ class rtx_iv vctp_reg_iv, condcount_reg_iv;
+ /* The loop counter is found to be independent of the decrement
+ of the reg used in the vctp_insn, again. Ensure that IV analysis
+ succeeds and check the step. */
+ if (!iv_analyze (vctp_insn, as_a<scalar_int_mode> (GET_MODE (vctp_reg)),
+ vctp_reg, &vctp_reg_iv))
+ return NULL;
+ /* Ensure it matches the number of lanes of the vctp instruction. */
+ if (abs (INTVAL (vctp_reg_iv.step))
+ != arm_mve_get_vctp_lanes (vctp_insn))
+ return NULL;
+
+ if (!arm_mve_check_reg_origin_is_num_elems (loop, condcount,
+ vctp_reg_iv.step,
+ vctp_reg))
+ return NULL;
+ }
+ /* If the decrements are the same, then the situation is simple: either they
+ are also the same reg, which is safe, or they are different registers, in
+ which case make sure that there is only a simple SET from one to the
+ other inside the loop. */
+ else if (decrementnum == arm_mve_get_vctp_lanes (vctp_insn))
+ {
+ if (REGNO (condcount) != REGNO (vctp_reg))
+ {
+ /* It wasn't the same reg, but it could be behind a
+ (set (vctp_reg) (condcount)), so instead find where
+ the VCTP insn is DEF'd inside the loop. */
+ rtx_insn *vctp_reg_insn
+ = DF_REF_INSN (df_bb_regno_only_def_find (loop->header,
+ REGNO (vctp_reg)));
+ rtx vctp_reg_set = single_set (vctp_reg_insn);
+ /* This must just be a simple SET from the condcount. */
+ if (!vctp_reg_set
+ || !REG_P (SET_DEST (vctp_reg_set))
+ || !REG_P (SET_SRC (vctp_reg_set))
+ || REGNO (SET_SRC (vctp_reg_set)) != REGNO (condcount))
+ return NULL;
+ }
+ }
+ else
+ return NULL;
+
+ /* We now only need to find out that the loop terminates with a LE
+ zero condition. If condconst is a const_int, then this is easy.
+ If it's a REG, look at the last condition+jump in a bb before
+ the loop, because that usually will have a branch jumping over
+ the loop header. */
+ rtx_insn *jump_insn = BB_END (loop->header);
+ if (CONST_INT_P (condconst)
+ && !(INTVAL (condconst) == 0 && JUMP_P (jump_insn)
+ && GET_CODE (XEXP (PATTERN (jump_insn), 1)) == IF_THEN_ELSE
+ && (GET_CODE (XEXP (XEXP (PATTERN (jump_insn), 1), 0)) == NE
+ || GET_CODE (XEXP (XEXP (PATTERN (jump_insn), 1), 0)) == GT)))
+ return NULL;
+ else if (REG_P (condconst))
+ {
+ basic_block pre_loop_bb = single_pred (loop_preheader_edge (loop)->src);
+ if (!pre_loop_bb)
+ return NULL;
+
+ rtx initial_compare = NULL_RTX;
+ if (!(prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb))
+ && INSN_P (prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb)))))
+ return NULL;
+ else
+ initial_compare
+ = single_set (prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb)));
+ if (!(initial_compare
+ && cc_register (SET_DEST (initial_compare), VOIDmode)
+ && GET_CODE (SET_SRC (initial_compare)) == COMPARE
+ && CONST_INT_P (XEXP (SET_SRC (initial_compare), 1))
+ && INTVAL (XEXP (SET_SRC (initial_compare), 1)) == 0))
+ return NULL;
+
+ /* Usually this is a LE condition, but it can also just be a GT or an EQ
+ condition (if the value is unsigned or the compiler knows it's not negative). */
+ rtx_insn *loop_jumpover = BB_END (pre_loop_bb);
+ if (!(JUMP_P (loop_jumpover)
+ && GET_CODE (XEXP (PATTERN (loop_jumpover), 1)) == IF_THEN_ELSE
+ && (GET_CODE (XEXP (XEXP (PATTERN (loop_jumpover), 1), 0)) == LE
+ || GET_CODE (XEXP (XEXP (PATTERN (loop_jumpover), 1), 0)) == GT
+ || GET_CODE (XEXP (XEXP (PATTERN (loop_jumpover), 1), 0)) == EQ)))
+ return NULL;
+ }
+
+ /* Everything looks valid. */
+ return vctp_insn;
+}
+
+/* Function to check a loop's structure to see if it is a valid candidate for
+ an MVE Tail Predicated Low-Overhead Loop. Returns the loop's VCTP_INSN if
+ it is valid, or NULL if it isn't. */
+
+static rtx_insn*
+arm_mve_loop_valid_for_dlstp (loop *loop)
+{
+ /* Doloop can only be done "elementwise" with predicated dlstp/letp if it
+ contains a VCTP on the number of elements processed by the loop.
+ Find the VCTP predicate generation inside the loop body BB. */
+ rtx_insn *vctp_insn = arm_mve_get_loop_vctp (loop->header);
+ if (!vctp_insn)
+ return NULL;
+
+ /* We only support two loop forms for tail predication:
+ DLSTP_TYPE_A) Loops of the form:
+ int num_of_lanes = 128 / elem_size;
+ while (num_of_elem > 0)
+ {
+ p = vctp<size> (num_of_elem);
+ num_of_elem -= num_of_lanes;
+ }
+ DLSTP_TYPE_B) Loops of the form:
+ int num_of_lanes = 128 / elem_size;
+ int num_of_iters = (num_of_elem + num_of_lanes - 1) / num_of_lanes;
+ for (i = 0; i < num_of_iters; i++)
+ {
+ p = vctp<size> (num_of_elem);
+ num_of_elem -= num_of_lanes;
+ }
+
+ Then, depending on the type of loop above, we will need to do
+ different sets of checks. */
+ iv_analysis_loop_init (loop);
+
+ /* In order to find out if the loop is of DLSTP_TYPE_A or DLSTP_TYPE_B above,
+ look for the loop counter: it will either be incrementing by one per
+ iteration or it will be decrementing by num_of_lanes. We can find the
+ loop counter in the condition at the end of the loop. */
+ rtx_insn *loop_cond = prev_nonnote_nondebug_insn_bb (BB_END (loop->header));
+ if (!(cc_register (XEXP (PATTERN (loop_cond), 0), VOIDmode)
+ && GET_CODE (XEXP (PATTERN (loop_cond), 1)) == COMPARE))
+ return NULL;
+
+ /* The operands in the condition: Try to identify which one is the
+ constant and which is the counter and run IV analysis on the latter. */
+ rtx cond_arg_1 = XEXP (XEXP (PATTERN (loop_cond), 1), 0);
+ rtx cond_arg_2 = XEXP (XEXP (PATTERN (loop_cond), 1), 1);
+
+ rtx loop_cond_constant;
+ rtx loop_counter;
+ class rtx_iv cond_counter_iv, cond_temp_iv;
+
+ if (CONST_INT_P (cond_arg_1))
+ {
+ /* cond_arg_1 is the constant and cond_arg_2 is the counter. */
+ loop_cond_constant = cond_arg_1;
+ loop_counter = cond_arg_2;
+ iv_analyze (loop_cond, as_a<scalar_int_mode> (GET_MODE (cond_arg_2)),
+ cond_arg_2, &cond_counter_iv);
+ }
+ else if (CONST_INT_P (cond_arg_2))
+ {
+ /* cond_arg_2 is the constant and cond_arg_1 is the counter. */
+ loop_cond_constant = cond_arg_2;
+ loop_counter = cond_arg_1;
+ iv_analyze (loop_cond, as_a<scalar_int_mode> (GET_MODE (cond_arg_1)),
+ cond_arg_1, &cond_counter_iv);
+ }
+ else if (REG_P (cond_arg_1) && REG_P (cond_arg_2))
+ {
+ /* If both operands to the compare are REGs, we can safely
+ run IV analysis on both and then determine which is the
+ constant by looking at the step.
+ First assume cond_arg_1 is the counter. */
+ loop_counter = cond_arg_1;
+ loop_cond_constant = cond_arg_2;
+ iv_analyze (loop_cond, as_a<scalar_int_mode> (GET_MODE (cond_arg_1)),
+ cond_arg_1, &cond_counter_iv);
+ iv_analyze (loop_cond, as_a<scalar_int_mode> (GET_MODE (cond_arg_2)),
+ cond_arg_2, &cond_temp_iv);
+
+ /* Look at the steps and swap around the rtx's if needed. Error out if
+ one of them cannot be identified as constant. */
+ if (!CONST_INT_P (cond_counter_iv.step) || !CONST_INT_P (cond_temp_iv.step))
+ return NULL;
+ if (INTVAL (cond_counter_iv.step) != 0 && INTVAL (cond_temp_iv.step) != 0)
+ return NULL;
+ if (INTVAL (cond_counter_iv.step) == 0 && INTVAL (cond_temp_iv.step) != 0)
+ {
+ loop_counter = cond_arg_2;
+ loop_cond_constant = cond_arg_1;
+ cond_counter_iv = cond_temp_iv;
+ }
+ }
+ else
+ return NULL;
+
+ if (!REG_P (loop_counter))
+ return NULL;
+ if (!(REG_P (loop_cond_constant) || CONST_INT_P (loop_cond_constant)))
+ return NULL;
+
+ /* Now we have extracted the IV step of the loop counter, call the
+ appropriate checking function. */
+ if (INTVAL (cond_counter_iv.step) > 0)
+ return arm_mve_dlstp_check_inc_counter (loop, vctp_insn,
+ loop_cond_constant, loop_counter);
+ else if (INTVAL (cond_counter_iv.step) < 0)
+ return arm_mve_dlstp_check_dec_counter (loop, vctp_insn,
+ loop_cond_constant, loop_counter);
+ else
+ return NULL;
+}
+
+/* Predict whether the given loop in gimple will be transformed in the RTL
+ doloop_optimize pass. It could be argued that turning large enough loops
+ into low-overhead loops would not show a significant performance boost.
+ However, in the case of tail predication we would still avoid using VPT/VPST
+ instructions inside the loop, and in either case using low-overhead loops
+ would not be detrimental, so we decided to not consider size, avoiding the
+ need for a heuristic to determine what an appropriate size boundary is. */
+
+static bool
+arm_predict_doloop_p (struct loop *loop)
+{
+ gcc_assert (loop);
+ /* On arm, targetm.can_use_doloop_p is actually
+ can_use_doloop_if_innermost. Ensure the loop is innermost,
+ that it is valid as per arm_target_bb_ok_for_lob, and that the
+ correct architecture flags are enabled. */
+ if (!(TARGET_HAVE_LOB && optimize > 0))
+ {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "Predict doloop failure due to"
+ " target architecture or optimisation flags.\n");
+ return false;
+ }
+ else if (loop->inner != NULL)
+ {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "Predict doloop failure due to"
+ " loop nesting.\n");
+ return false;
+ }
+ else if (!arm_target_bb_ok_for_lob (loop->header->next_bb))
+ {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "Predict doloop failure due to"
+ " loop bb complexity.\n");
+ return false;
+ }
+
+ return true;
+}
+
+/* Implement targetm.loop_unroll_adjust. Use this to block unrolling of loops
+ that may later be turned into MVE Tail Predicated Low Overhead Loops. The
+ performance benefit of an MVE LoL is likely to be much higher than that of
+ the unrolling. */
+
+unsigned
+arm_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
+{
+ if (TARGET_HAVE_MVE
+ && arm_target_bb_ok_for_lob (loop->latch)
+ && arm_mve_loop_valid_for_dlstp (loop))
+ return 0;
+ else
+ return nunroll;
+}
+
+/* Function to handle emitting a VPT-unpredicated version of a VPT-predicated
+ insn to a sequence. */
+
+static bool
+arm_emit_mve_unpredicated_insn_to_seq (rtx_insn* insn)
+{
+ rtx insn_vpr_reg_operand = arm_get_required_vpr_reg_param (insn);
+ int new_icode = get_attr_mve_unpredicated_insn (insn);
+ if (!in_sequence_p ()
+ || !MVE_VPT_PREDICATED_INSN_P (insn)
+ || (!insn_vpr_reg_operand)
+ || (!new_icode))
+ return false;
+
+ extract_insn (insn);
+ rtx arr[8];
+ int j = 0;
+
+ /* When transforming a VPT-predicated instruction into its unpredicated
+ equivalent we need to drop the VPR operand and we may need to also drop a
+ merge "vuninit" input operand, depending on the instruction pattern. Here
+ ensure that we have at most a two-operand difference between the two
+ instructions. */
+ int n_operands_diff
+ = recog_data.n_operands - insn_data[new_icode].n_operands;
+ if (!(n_operands_diff > 0 && n_operands_diff <= 2))
+ return false;
+
+ rtx move = NULL_RTX;
+ /* Then, loop through the operands of the predicated
+ instruction, and retain the ones that map to the
+ unpredicated instruction. */
+ for (int i = 0; i < recog_data.n_operands; i++)
+ {
+ /* Ignore the VPR and, if needed, the vuninit
+ operand. */
+ if (insn_vpr_reg_operand == recog_data.operand[i])
+ continue;
+ if (n_operands_diff == 2
+ && !strcmp (recog_data.constraints[i], "0"))
+ {
+ move = gen_rtx_SET (arr[0], recog_data.operand[i]);
+ arr[0] = recog_data.operand[i];
+ }
+ else
+ arr[j++] = recog_data.operand[i];
+ }
+
+ /* Finally, emit the unpredicated instruction. */
+ rtx_insn *new_insn;
+ switch (j)
+ {
+ case 1:
+ new_insn = emit_insn (GEN_FCN (new_icode) (arr[0]));
+ break;
+ case 2:
+ new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1]));
+ break;
+ case 3:
+ new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2]));
+ break;
+ case 4:
+ new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+ arr[3]));
+ break;
+ case 5:
+ new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+ arr[3], arr[4]));
+ break;
+ case 6:
+ new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+ arr[3], arr[4], arr[5]));
+ break;
+ case 7:
+ new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+ arr[3], arr[4], arr[5],
+ arr[6]));
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ INSN_LOCATION (new_insn) = INSN_LOCATION (insn);
+ if (move)
+ {
+ new_insn = emit_insn (move);
+ INSN_LOCATION (new_insn) = INSN_LOCATION (insn);
+ }
+ return true;
+}
+
+/* Return TRUE if INSN defines an MVE vector operand that has zeroed
+ tail-predicated lanes. This is true if either:
+ * INSN is predicated by VCTP_VPR_GENERATED and the 'invalid lanes' operand
+ is in the PROPS_ZERO_SET,
+ * all MVE vector operands are in the PROPS_ZERO_SET
+*/
+
+static bool
+arm_mve_propagate_zero_pred_p (vec <rtx_insn *> *props_zero_set,
+ rtx_insn *insn, rtx vctp_vpr_generated)
+{
+ if (arm_mve_load_store_insn_p (insn, DL_USAGE_READ))
+ return true;
+ if (arm_mve_load_store_insn_p (insn, DL_USAGE_WRITE))
+ return false;
+
+ int inactive_idx = -1;
+
+ extract_insn (insn);
+ /* If INSN is predicated by VCTP_VPR_GENERATED, then all tail-predicated
+ lanes will keep the value that is in the 'invalid lanes' register which we
+ identify by the "0" constraint, to ensure it is the same as the 'result'
+ register of this instruction. */
+ if (arm_mve_insn_predicated_by (insn, vctp_vpr_generated))
+ {
+ for (int i = 0; i < recog_data.n_operands; i++)
+ {
+ if (strcmp (recog_data.constraints[i], "0") == 0
+ && VALID_MVE_MODE (GET_MODE (recog_data.operand[i])))
+ {
+ inactive_idx = i;
+ break;
+ }
+ }
+ }
+
+ if (inactive_idx > 0)
+ {
+ rtx op = recog_data.operand[inactive_idx];
+ rtx_insn *def_insn = arm_last_vect_def_insn (op, insn);
+ return def_insn != NULL_RTX && props_zero_set->contains (def_insn);
+ }
+
+ /* If this instruction is not predicated by VCTP_VPR_GENERATED, then we must
+ check that all vector operands have zeroed tail-predicated lanes, and that
+ it has at least one vector operand. */
+ bool at_least_one_vector = false;
+ df_ref insn_uses;
+ FOR_EACH_INSN_USE (insn_uses, insn)
+ {
+ rtx reg = DF_REF_REG (insn_uses);
+ if (!VALID_MVE_MODE (GET_MODE (reg)))
+ continue;
+
+ rtx_insn *def_insn = arm_last_vect_def_insn (reg, insn);
+ if (def_insn && props_zero_set->contains (def_insn))
+ at_least_one_vector |= true;
+ else
+ return false;
+
+ }
+ return at_least_one_vector;
+}
+
+
+/* Attempt to transform the loop contents of the loop basic block from
+ VPT-predicated insns into unpredicated insns for a dlstp/letp loop. Returns
+ the number to decrement from the total number of elements each iteration.
+ Returns 1 if tail predication cannot be performed, falling back to scalar
+ low-overhead loops. */
+
+int
+arm_attempt_dlstp_transform (rtx label)
+{
+ if (!dlstp_enabled)
+ return 1;
+
+ basic_block body = single_succ (BLOCK_FOR_INSN (label));
+
+ /* Ensure that the bb is within a loop that has all required metadata. */
+ if (!body->loop_father || !body->loop_father->header
+ || !body->loop_father->simple_loop_desc)
+ return 1;
+
+ loop *loop = body->loop_father;
+ /* Instruction that sets the predicate mask depending on how many elements
+ are left to process. */
+ rtx_insn *vctp_insn = arm_mve_loop_valid_for_dlstp (loop);
+ if (!vctp_insn)
+ return 1;
+
+ gcc_assert (single_set (vctp_insn));
+
+ rtx vctp_vpr_generated = single_set (vctp_insn);
+ if (!vctp_vpr_generated)
+ return 1;
+
+ vctp_vpr_generated = SET_DEST (vctp_vpr_generated);
+
+ if (!vctp_vpr_generated || !REG_P (vctp_vpr_generated)
+ || !VALID_MVE_PRED_MODE (GET_MODE (vctp_vpr_generated)))
+ return 1;
+
+ /* decrementnum is already known to be valid at this point. */
+ int decrementnum = arm_mve_get_vctp_lanes (vctp_insn);
+
+ rtx_insn *insn = 0;
+ rtx_insn *cur_insn = 0;
+ rtx_insn *seq;
+ auto_vec <rtx_insn *> props_zero_set;
+
+ /* Scan through the insns in the loop bb and emit the transformed bb
+ insns to a sequence. */
+ start_sequence ();
+ FOR_BB_INSNS (body, insn)
+ {
+ if (GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn))
+ continue;
+ else if (NOTE_P (insn))
+ emit_note ((enum insn_note)NOTE_KIND (insn));
+ else if (DEBUG_INSN_P (insn))
+ emit_debug_insn (PATTERN (insn));
+ else if (!INSN_P (insn))
+ {
+ end_sequence ();
+ return 1;
+ }
+ /* If the transformation is successful we no longer need the vctp
+ instruction. */
+ else if (insn == vctp_insn)
+ continue;
+ /* If the insn pattern requires the use of the VPR value from the
+ vctp as an input parameter for predication. */
+ else if (arm_mve_insn_predicated_by (insn, vctp_vpr_generated))
+ {
+ /* Check whether this INSN propagates the zeroed tail-predication
+ lanes. */
+ if (arm_mve_propagate_zero_pred_p (&props_zero_set, insn,
+ vctp_vpr_generated))
+ props_zero_set.safe_push (insn);
+ bool success = arm_emit_mve_unpredicated_insn_to_seq (insn);
+ if (!success)
+ {
+ end_sequence ();
+ return 1;
+ }
+ }
+ /* If the insn isn't VPT predicated on vctp_vpr_generated, we need to
+ make sure that it is still valid within the dlstp/letp loop. */
+ else
+ {
+ /* If this instruction USE-s the vctp_vpr_generated other than for
+ predication, this blocks the transformation as we are not allowed
+ to optimise the VPR value away. */
+ df_ref insn_uses = NULL;
+ FOR_EACH_INSN_USE (insn_uses, insn)
+ {
+ if (rtx_equal_p (vctp_vpr_generated, DF_REF_REG (insn_uses)))
+ {
+ end_sequence ();
+ return 1;
+ }
+ }
+ /* If within the loop we have an MVE vector instruction that is
+ unpredicated, the dlstp/letp looping will add implicit
+ predication to it. This will result in a change in behaviour
+ of the instruction, so we need to find out if any instructions
+ that feed into the current instruction were implicitly
+ predicated. */
+ if (MVE_VPT_PREDICABLE_INSN_P (insn)
+ && !arm_mve_impl_predicated_p (&props_zero_set, insn,
+ vctp_vpr_generated))
+ {
+ end_sequence ();
+ return 1;
+ }
+ emit_insn (PATTERN (insn));
+ }
+ }
+ seq = get_insns ();
+ end_sequence ();
+
+ /* Re-write the entire BB contents with the transformed
+ sequence. */
+ FOR_BB_INSNS_SAFE (body, insn, cur_insn)
+ if (!(GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn)))
+ delete_insn (insn);
+
+ emit_insn_after (seq, BB_END (body));
+
+ /* The transformation has succeeded, so now modify the "count"
+ (a.k.a. niter_expr) for the middle-end. Also set noloop_assumptions
+ to NULL to stop the middle-end from making assumptions about the
+ number of iterations. */
+ simple_loop_desc (body->loop_father)->niter_expr
+ = XVECEXP (SET_SRC (PATTERN (vctp_insn)), 0, 0);
+ simple_loop_desc (body->loop_father)->noloop_assumptions = NULL_RTX;
+ return decrementnum;
}
#if CHECKING_P
diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt
index 0cd3fc2cd0c..d88c7a52e75 100644
--- a/gcc/config/arm/arm.opt
+++ b/gcc/config/arm/arm.opt
@@ -363,5 +363,8 @@ Target Joined RejectNegative String Var(arm_stack_protector_guard_offset_str)
Use an immediate to offset from the TLS register. This option is for use with
fstack-protector-guard=tls and not for use in user-land code.
+mdlstp
+Target Var(dlstp_enabled) Init(1) Undocumented
+
TargetVariable
long arm_stack_protector_guard_offset = 0
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 8d066fcf05d..987602da1bf 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -2686,6 +2686,17 @@
(define_int_attr mrrc [(VUNSPEC_MRRC "mrrc") (VUNSPEC_MRRC2 "mrrc2")])
(define_int_attr MRRC [(VUNSPEC_MRRC "MRRC") (VUNSPEC_MRRC2 "MRRC2")])
+(define_int_attr dlstp_elemsize [(DLSTP8 "8") (DLSTP16 "16") (DLSTP32 "32")
+ (DLSTP64 "64")])
+
+(define_int_attr letp_num_lanes [(LETP8 "16") (LETP16 "8") (LETP32 "4")
+ (LETP64 "2")])
+(define_int_attr letp_num_lanes_neg [(LETP8 "-16") (LETP16 "-8") (LETP32 "-4")
+ (LETP64 "-2")])
+
+(define_int_attr letp_num_lanes_minus_1 [(LETP8 "15") (LETP16 "7") (LETP32 "3")
+ (LETP64 "1")])
+
(define_int_attr opsuffix [(UNSPEC_DOT_S "s8")
(UNSPEC_DOT_U "u8")
(UNSPEC_DOT_US "s8")
@@ -2926,6 +2937,10 @@
(define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U])
(define_int_iterator VQSHLUQ_M_N [VQSHLUQ_M_N_S])
(define_int_iterator VQSHLUQ_N [VQSHLUQ_N_S])
+(define_int_iterator DLSTP [DLSTP8 DLSTP16 DLSTP32
+ DLSTP64])
+(define_int_iterator LETP [LETP8 LETP16 LETP32
+ LETP64])
;; Define iterators for VCMLA operations
(define_int_iterator VCMLA_OP [UNSPEC_VCMLA
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 9fe51298cdc..4b4d6298ffb 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -6930,3 +6930,53 @@
}
}
)
+
+;; Originally expanded by 'predicated_doloop_end'.
+;; In the rare situation where the branch is too far, we also need to
+;; revert FPSCR.LTPSIZE back to 0x100 after the last iteration.
+(define_insn "predicated_doloop_end_internal<letp_num_lanes>"
+ [(set (pc)
+ (if_then_else
+ (gtu (plus:SI (reg:SI LR_REGNUM)
+ (const_int <letp_num_lanes_neg>))
+ (const_int <letp_num_lanes_minus_1>))
+ (match_operand 0 "" "")
+ (pc)))
+ (set (reg:SI LR_REGNUM)
+ (plus:SI (reg:SI LR_REGNUM) (const_int <letp_num_lanes_neg>)))
+ ;; We use UNSPEC here to guarantee this pattern cannot be
+ ;; generated by an RTL optimization and be matched by other
+ ;; patterns, since this pattern is also responsible for turning off
+ ;; the tail predication machinery if we were to exit the loop.
+ ;; This is done by either the LETP or the LCTP instructions that
+ ;; this pattern generates.
+ (use (unspec:SI [(const_int 0)] LETP))
+ (clobber (reg:CC CC_REGNUM))]
+ "TARGET_HAVE_MVE"
+ {
+ if (get_attr_length (insn) == 4)
+ return "letp\t%|lr, %l0";
+ else
+ return "subs\t%|lr, #<letp_num_lanes>\n\tbhi\t%l0\n\tlctp";
+ }
+ [(set (attr "length")
+ (if_then_else
+ (ltu (minus (pc) (match_dup 0)) (const_int 1024))
+ (const_int 4)
+ (const_int 12)))
+ (set_attr "type" "branch")
+ (set_attr "conds" "unconditional")])
+
+(define_insn "dlstp<dlstp_elemsize>_insn"
+ [
+ (set (reg:SI LR_REGNUM)
+;; Similar to the previous pattern, we use UNSPEC here to make sure this
+;; rtx construct is not matched by other patterns, as this pattern is also
+;; responsible for setting the element size of the tail predication machinery
+;; using the dlstp.<size> instruction.
+ (unspec_volatile:SI [(match_operand:SI 0 "s_register_operand" "r")]
+ DLSTP))
+ ]
+ "TARGET_HAVE_MVE"
+ "dlstp.<dlstp_elemsize>\t%|lr, %0"
+ [(set_attr "type" "mve_misc")])
diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md
index 84c9c3dfe80..66b3ae6040c 100644
--- a/gcc/config/arm/thumb2.md
+++ b/gcc/config/arm/thumb2.md
@@ -1613,7 +1613,7 @@
(use (match_operand 1 "" ""))] ; label
"TARGET_32BIT"
"
- {
+{
/* Currently SMS relies on the do-loop pattern to recognize loops
where (1) the control part consists of all insns defining and/or
using a certain 'count' register and (2) the loop count can be
@@ -1623,41 +1623,75 @@
Also used to implement the low over head loops feature, which is part of
the Armv8.1-M Mainline Low Overhead Branch (LOB) extension. */
- if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB))
- {
- rtx s0;
- rtx bcomp;
- rtx loc_ref;
- rtx cc_reg;
- rtx insn;
- rtx cmp;
-
- if (GET_MODE (operands[0]) != SImode)
- FAIL;
-
- s0 = operands [0];
-
- /* Low over head loop instructions require the first operand to be LR. */
- if (TARGET_HAVE_LOB && arm_target_insn_ok_for_lob (operands [1]))
- s0 = gen_rtx_REG (SImode, LR_REGNUM);
-
- if (TARGET_THUMB2)
- insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, GEN_INT (-1)));
- else
- insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1)));
-
- cmp = XVECEXP (PATTERN (insn), 0, 0);
- cc_reg = SET_DEST (cmp);
- bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
- loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands [1]);
- emit_jump_insn (gen_rtx_SET (pc_rtx,
- gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
- loc_ref, pc_rtx)));
- DONE;
- }
- else
- FAIL;
- }")
+ if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB))
+ {
+ rtx s0;
+ rtx bcomp;
+ rtx loc_ref;
+ rtx cc_reg;
+ rtx insn;
+ rtx cmp;
+ int decrement_num;
+
+ if (GET_MODE (operands[0]) != SImode)
+ FAIL;
+
+ s0 = operands[0];
+
+ if (TARGET_HAVE_LOB
+ && arm_target_bb_ok_for_lob (BLOCK_FOR_INSN (operands[1])))
+ {
+ /* If we have a compatible MVE target, try and analyse the loop
+ contents to determine if we can use predicated dlstp/letp
+ looping. These patterns implicitly use LR as the loop counter. */
+ if (TARGET_HAVE_MVE
+ && ((decrement_num = arm_attempt_dlstp_transform (operands[1]))
+ != 1))
+ {
+ loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]);
+ switch (decrement_num)
+ {
+ case 2:
+ insn = gen_predicated_doloop_end_internal2 (loc_ref);
+ break;
+ case 4:
+ insn = gen_predicated_doloop_end_internal4 (loc_ref);
+ break;
+ case 8:
+ insn = gen_predicated_doloop_end_internal8 (loc_ref);
+ break;
+ case 16:
+ insn = gen_predicated_doloop_end_internal16 (loc_ref);
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ emit_jump_insn (insn);
+ DONE;
+ }
+ /* Remaining LOB cases need to explicitly use LR. */
+ s0 = gen_rtx_REG (SImode, LR_REGNUM);
+ }
+
+ /* Otherwise, try standard decrement-by-one dls/le looping. */
+ if (TARGET_THUMB2)
+ insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0,
+ GEN_INT (-1)));
+ else
+ insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1)));
+
+ cmp = XVECEXP (PATTERN (insn), 0, 0);
+ cc_reg = SET_DEST (cmp);
+ bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
+ loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]);
+ emit_jump_insn (gen_rtx_SET (pc_rtx,
+ gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
+ loc_ref, pc_rtx)));
+ DONE;
+ }
+ else
+ FAIL;
+}")
(define_insn "*clear_apsr"
[(unspec_volatile:SI [(const_int 0)] VUNSPEC_CLRM_APSR)
@@ -1755,7 +1789,37 @@
{
if (REGNO (operands[0]) == LR_REGNUM)
{
- emit_insn (gen_dls_insn (operands[0]));
+ /* Pick out the number by which we are decrementing the loop counter
+ in every iteration. If it's > 1, then use dlstp. */
+ int const_int_dec_num
+ = abs (INTVAL (XEXP (XEXP (XVECEXP (PATTERN (operands[1]), 0, 1),
+ 1),
+ 1)));
+ switch (const_int_dec_num)
+ {
+ case 16:
+ emit_insn (gen_dlstp8_insn (operands[0]));
+ break;
+
+ case 8:
+ emit_insn (gen_dlstp16_insn (operands[0]));
+ break;
+
+ case 4:
+ emit_insn (gen_dlstp32_insn (operands[0]));
+ break;
+
+ case 2:
+ emit_insn (gen_dlstp64_insn (operands[0]));
+ break;
+
+ case 1:
+ emit_insn (gen_dls_insn (operands[0]));
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
DONE;
}
else
diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md
index e2b70da1001..9527bdb9e87 100644
--- a/gcc/config/arm/types.md
+++ b/gcc/config/arm/types.md
@@ -574,6 +574,7 @@
; mve_move
; mve_store
; mve_load
+; mve_misc
(define_attr "type"
"adc_imm,\
@@ -1126,7 +1127,8 @@
ls64,\
mve_move,\
mve_store,\
- mve_load"
+ mve_load, \
+ mve_misc"
(cond [(eq_attr "autodetect_type" "alu_shift_lsr_op2,alu_shift_asr_op2")
(const_string "alu_shift_imm_other")
(eq_attr "autodetect_type" "alu_shift_lsl_op2")
@@ -1292,7 +1294,7 @@
;; No otherwise.
(define_attr "is_mve_type" "yes,no"
(if_then_else (eq_attr "type"
- "mve_move, mve_load, mve_store, mrs")
+ "mve_move, mve_load, mve_store, mrs, mve_misc")
(const_string "yes")
(const_string "no")))
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 46ac8b37157..f5f4d154364 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -591,6 +591,10 @@
VADDLVQ_U
VCTP
VCTP_M
+ LETP8
+ LETP16
+ LETP32
+ LETP64
VPNOT
VCREATEQ_F
VCVTQ_N_TO_F_S
@@ -1259,6 +1263,14 @@
UQRSHLL_48
SQRSHRL_64
SQRSHRL_48
- VSHLCQ_M_
REINTERPRET
])
+
+; DLSTP unspecs must be volatile to guarantee the scheduler does not reschedule
+; these instructions within the loop preheader.
+(define_c_enum "unspecv" [
+ DLSTP8
+ DLSTP16
+ DLSTP32
+ DLSTP64
+])
diff --git a/gcc/testsuite/gcc.target/arm/lob.h b/gcc/testsuite/gcc.target/arm/lob.h
index feaae7cc899..3941fe7a8b6 100644
--- a/gcc/testsuite/gcc.target/arm/lob.h
+++ b/gcc/testsuite/gcc.target/arm/lob.h
@@ -1,15 +1,131 @@
#include <string.h>
-
+#include <stdint.h>
/* Common code for lob tests. */
#define NO_LOB asm volatile ("@ clobber lr" : : : "lr" )
-#define N 10000
+#define N 100
+
+static void
+reset_data (int *a, int *b, int *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (b, -1, x * sizeof (*b));
+ memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data8 (int8_t *a, int8_t *b, int8_t *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (b, -1, x * sizeof (*b));
+ memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data16 (int16_t *a, int16_t *b, int16_t *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (b, -1, x * sizeof (*b));
+ memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data32 (int32_t *a, int32_t *b, int32_t *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (b, -1, x * sizeof (*b));
+ memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data64 (int64_t *a, int64_t *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (c, 0, x * sizeof (*c));
+}
+
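+/* Check that the first x elements of c hold a[i] + b[i] and that the
+   remaining elements are still zero from reset_data.  */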
+static void
+check_plus (int *a, int *b, int *c, int x)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != (a[i] + b[i])) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
+}
+
+static void
+check_plus8 (int8_t *a, int8_t *b, int8_t *c, int x)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != (a[i] + b[i])) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
+}
+
+static void
+check_plus16 (int16_t *a, int16_t *b, int16_t *c, int x)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != (a[i] + b[i])) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
+}
+
+static void
+check_plus32 (int32_t *a, int32_t *b, int32_t *c, int x)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != (a[i] + b[i])) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
+}
static void
-reset_data (int *a, int *b, int *c)
+check_memcpy64 (int64_t *a, int64_t *c, int x)
{
- memset (a, -1, N * sizeof (*a));
- memset (b, -1, N * sizeof (*b));
- memset (c, -1, N * sizeof (*c));
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != a[i]) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
}
diff --git a/gcc/testsuite/gcc.target/arm/lob1.c b/gcc/testsuite/gcc.target/arm/lob1.c
index ba5c82cd55c..c8ce653a5c3 100644
--- a/gcc/testsuite/gcc.target/arm/lob1.c
+++ b/gcc/testsuite/gcc.target/arm/lob1.c
@@ -54,29 +54,18 @@ loop3 (int *a, int *b, int *c)
} while (i < N);
}
-void
-check (int *a, int *b, int *c)
-{
- for (int i = 0; i < N; i++)
- {
- NO_LOB;
- if (c[i] != a[i] + b[i])
- abort ();
- }
-}
-
int
main (void)
{
- reset_data (a, b, c);
+ reset_data (a, b, c, N);
loop1 (a, b ,c);
- check (a, b ,c);
- reset_data (a, b, c);
+ check_plus (a, b, c, N);
+ reset_data (a, b, c, N);
loop2 (a, b ,c);
- check (a, b ,c);
- reset_data (a, b, c);
+ check_plus (a, b, c, N);
+ reset_data (a, b, c, N);
loop3 (a, b ,c);
- check (a, b ,c);
+ check_plus (a, b, c, N);
return 0;
}
diff --git a/gcc/testsuite/gcc.target/arm/lob6.c b/gcc/testsuite/gcc.target/arm/lob6.c
index 17b6124295e..4fe116e2c2b 100644
--- a/gcc/testsuite/gcc.target/arm/lob6.c
+++ b/gcc/testsuite/gcc.target/arm/lob6.c
@@ -79,14 +79,14 @@ check (void)
int
main (void)
{
- reset_data (a1, b1, c1);
- reset_data (a2, b2, c2);
+ reset_data (a1, b1, c1, N);
+ reset_data (a2, b2, c2, N);
loop1 (a1, b1, c1);
ref1 (a2, b2, c2);
check ();
- reset_data (a1, b1, c1);
- reset_data (a2, b2, c2);
+ reset_data (a1, b1, c1, N);
+ reset_data (a2, b2, c2, N);
loop2 (a1, b1, c1);
ref2 (a2, b2, c2);
check ();
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c
new file mode 100644
index 00000000000..6e6da3d3d59
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c
@@ -0,0 +1,146 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O3 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include <arm_mve.h>
+
+#define IMM 5
+
+#define TEST_COMPILE_IN_DLSTP_TERNARY(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \
+void test_##NAME##PRED##_##SIGN##BITS (TYPE##BITS##_t *a, TYPE##BITS##_t *b, TYPE##BITS##_t *c, int n) \
+{ \
+ while (n > 0) \
+ { \
+ mve_pred16_t p = vctp##BITS##q (n); \
+ TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \
+ TYPE##BITS##x##LANES##_t vb = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (b, p); \
+ TYPE##BITS##x##LANES##_t vc = NAME##PRED##_##SIGN##BITS (va, vb, p); \
+ vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \
+ c += LANES; \
+ a += LANES; \
+ b += LANES; \
+ n -= LANES; \
+ } \
+}
+
+#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY(BITS, LANES, LDRSTRYTPE, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED)
+
+#define TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY(NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY (8, 16, b, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY (16, 8, h, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY (32, 4, w, NAME, PRED)
+
+
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vaddq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vmulq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vsubq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vhaddq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vorrq, _x)
+
+
+#define TEST_COMPILE_IN_DLSTP_TERNARY_M(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \
+void test_##NAME##PRED##_##SIGN##BITS (TYPE##BITS##x##LANES##_t __inactive, TYPE##BITS##_t *a, TYPE##BITS##_t *b, TYPE##BITS##_t *c, int n) \
+{ \
+ while (n > 0) \
+ { \
+ mve_pred16_t p = vctp##BITS##q (n); \
+ TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \
+ TYPE##BITS##x##LANES##_t vb = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (b, p); \
+ TYPE##BITS##x##LANES##_t vc = NAME##PRED##_##SIGN##BITS (__inactive, va, vb, p); \
+ vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \
+ c += LANES; \
+ a += LANES; \
+ b += LANES; \
+ n -= LANES; \
+ } \
+}
+
+#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M(BITS, LANES, LDRSTRYTPE, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY_M (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY_M (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED)
+
+#define TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M(NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M (8, 16, b, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M (16, 8, h, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M (32, 4, w, NAME, PRED)
+
+
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vaddq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vmulq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vsubq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vhaddq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vorrq, _m)
+
+#define TEST_COMPILE_IN_DLSTP_TERNARY_N(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \
+void test_##NAME##PRED##_n_##SIGN##BITS (TYPE##BITS##_t *a, TYPE##BITS##_t *c, int n) \
+{ \
+ while (n > 0) \
+ { \
+ mve_pred16_t p = vctp##BITS##q (n); \
+ TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \
+ TYPE##BITS##x##LANES##_t vc = NAME##PRED##_n_##SIGN##BITS (va, IMM, p); \
+ vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \
+ c += LANES; \
+ a += LANES; \
+ n -= LANES; \
+ } \
+}
+
+#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N(BITS, LANES, LDRSTRYTPE, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY_N (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY_N (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED)
+
+#define TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N(NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N (8, 16, b, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N (16, 8, h, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N (32, 4, w, NAME, PRED)
+
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vaddq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vmulq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vsubq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vhaddq, _x)
+
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vbrsrq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vshlq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vshrq, _x)
+
+#define TEST_COMPILE_IN_DLSTP_TERNARY_M_N(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \
+void test_##NAME##PRED##_n_##SIGN##BITS (TYPE##BITS##x##LANES##_t __inactive, TYPE##BITS##_t *a, TYPE##BITS##_t *c, int n) \
+{ \
+ while (n > 0) \
+ { \
+ mve_pred16_t p = vctp##BITS##q (n); \
+ TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \
+ TYPE##BITS##x##LANES##_t vc = NAME##PRED##_n_##SIGN##BITS (__inactive, va, IMM, p); \
+ vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \
+ c += LANES; \
+ a += LANES; \
+ n -= LANES; \
+ } \
+}
+
+#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N(BITS, LANES, LDRSTRYTPE, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY_M_N (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY_M_N (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED)
+
+#define TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N(NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N (8, 16, b, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N (16, 8, h, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N (32, 4, w, NAME, PRED)
+
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vaddq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vmulq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vsubq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vhaddq, _m)
+
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vbrsrq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vshlq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vshrq, _m)
+
+/* The total number of dlstp loops is the number of
+  `TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY.*` macro invocations
+  multiplied by 6 (three element sizes, each signed and unsigned): 24 * 6 = 144. */
+/* { dg-final { scan-assembler-times {\tdlstp} 144 } } */
+/* { dg-final { scan-assembler-times {\tletp} 144 } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c
new file mode 100644
index 00000000000..84f4a2fc4f9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c
@@ -0,0 +1,749 @@
+
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O3 -save-temps -fno-schedule-insns2 " } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-mtune=cortex-m55" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_mve.h>
+/* Using a >=1 condition. */
+void test1 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n >= 1)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c+=4;
+ a+=4;
+ b+=4;
+ n-=4;
+ }
+}
+/*
+** test1:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vldrw.32 q[0-9]+, \[r1\], #16
+** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Test a for-loop format that decrements down to zero. */
+int32_t a[] = {0, 1, 2, 3, 4, 5, 6, 7};
+void test2 (int32_t *b, int num_elems)
+{
+ for (int i = num_elems; i > 0; i-= 4)
+ {
+ mve_pred16_t p = vctp32q (i);
+ int32x4_t va = vldrwq_z_s32 (&(a[i]), p);
+ vstrwq_p_s32 (b + i, va, p);
+ }
+}
+/*
+** test2:
+**...
+** dlstp.32 lr, r1
+**...
+** vldrw.32 (q[0-9]+), \[r3\], #-16
+** vstrw.32 \1, \[r0\], #-16
+** letp lr, .*
+**...
+*/
+
+/* Iteration counter counting up to num_iter. */
+void test3 (uint8_t *a, uint8_t *b, uint8_t *c, unsigned n)
+{
+ int num_iter = (n + 15)/16;
+ for (int i = 0; i < num_iter; i++)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n-=16;
+ a += 16;
+ b += 16;
+ c += 16;
+ }
+}
+
+/*
+** test3:
+**...
+** dlstp.8 lr, r3
+**...
+** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
+** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
+**...
+** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrb.8 \3, \[(r[0-9]+|ip)\]
+**...
+** letp lr, .*
+**...
+*/
+
+/* Iteration counter counting down from num_iter. */
+void test4 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ int num_iter = (n + 15)/16;
+ for (int i = num_iter; i > 0; i--)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n-=16;
+ a += 16;
+ b += 16;
+ c += 16;
+ }
+}
+/*
+** test4:
+**...
+** dlstp.8 lr, r3
+**...
+** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
+** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
+**...
+** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrb.8 \3, \[(r[0-9]+|ip)\]
+**...
+** letp lr, .*
+**...
+*/
+
+/* Using an unpredicated arithmetic instruction within the loop. */
+void test5 (uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *d, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_u8 (b);
+      /* vc is affected by implicit predication, because vb also
+        came from an unpredicated load, but there is no functional
+        problem, because the result is only used in a predicated store. */
+ uint8x16_t vc = vaddq_u8 (va, vb);
+ uint8x16_t vd = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ vstrbq_p_u8 (d, vd, p);
+ n-=16;
+ a += 16;
+ b += 16;
+ c += 16;
+ }
+}
+
+/*
+** test5:
+**...
+** dlstp.8 lr, r[0-9]+
+**...
+** vldrb.8 q[0-9]+, \[r1\]
+** vldrb.8 q[0-9]+, \[r2\]
+**...
+** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
+**...
+** vstrb.8 \1, \[r2\]
+** vstrb.8 \1, \[r3\]
+** letp lr, .*
+**...
+*/
+
+/* Using a different VPR value for one instruction in the loop. */
+void test6 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/*
+** test6:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vpst
+** vldrwt.32 q[0-9]+, \[r1\], #16
+** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Generating and using another VPR value in the loop, with a vctp.
+ The doloop logic will always try to do the transform on the first
+ vctp it encounters, so this is still expected to work. */
+void test7 (int32_t *a, int32_t *b, int32_t *c, int n, int g)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ mve_pred16_t p1 = vctp32q (g);
+ int32x4_t vb = vldrwq_z_s32 (b, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+/*
+** test7:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vpst
+** vldrwt.32 q[0-9]+, \[r1\], #16
+** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Generating and using a different VPR value in the loop, with a vctp,
+   but this time p1 will also change in every iteration (still fine). */
+void test8 (int32_t *a, int32_t *b, int32_t *c, int n, int g)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ mve_pred16_t p1 = vctp32q (g);
+ int32x4_t vb = vldrwq_z_s32 (b, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ g++;
+ }
+}
+
+/*
+** test8:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vctp.32 r4
+** vpst
+** vldrwt.32 q[0-9]+, \[r1\], #16
+**...
+** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Generating and using a different VPR value in the loop, with a vctp_m
+ that is independent of the loop vctp VPR. */
+void test9 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ mve_pred16_t p2 = vctp32q_m (n, p1);
+ int32x4_t vb = vldrwq_z_s32 (b, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p2);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/*
+** test9:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vmsr p0, (r[0-9]+) @ movhi
+** vpst
+** vctpt.32 r3
+** vmrs (r[0-9]+), p0 @ movhi
+** vmsr p0, \1 @ movhi
+** vpst
+** vldrwt.32 q[0-9]+, \[r1\], #16
+** vmsr p0, \2 @ movhi
+** vpst
+** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+**...
+** vstrw.32 \3, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Generating and using a different VPR value in the loop,
+ with a vctp_m that is tied to the base vctp VPR. This
+ is still fine, because the vctp_m will be transformed
+ into a vctp and be implicitly predicated. */
+void test10 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ mve_pred16_t p1 = vctp32q_m (n, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p1);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+/*
+   We don't need that extra vctp in the loop, but we currently do not optimize
+   it away; it is, however, not incorrect to use it.
+*/
+/*
+** test10:
+**...
+** dlstp.32 lr, r3
+** vctp.32 r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+**...
+** vpst
+** vldrwt.32 q[0-9]+, \[r1\], #16
+** vpst
+** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Generating and using a different VPR value in the loop, with a vcmp. */
+void test11 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ mve_pred16_t p1 = vcmpeqq_s32 (va, vb);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p1);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/*
+** test11:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vldrw.32 q[0-9]+, \[r1\], #16
+** vcmp.i32 eq, q[0-9]+, q[0-9]+
+** vpst
+** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Generating and using a different VPR value in the loop, with a vcmp_m. */
+void test12 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ mve_pred16_t p2 = vcmpeqq_m_s32 (va, vb, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p2);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/*
+** test12:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vldrw.32 q[0-9]+, \[r1\], #16
+** vmsr p0, (r[0-9]+|ip) @ movhi
+** vpst
+** vcmpt.i32 eq, q[0-9]+, q[0-9]+
+** vpst
+** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \2, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Generating and using a different VPR value in the loop, with a vcmp_m
+ that is tied to the base vctp VPR (same as above, this will be turned
+ into a vcmp and be implicitly predicated). */
+void test13 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ mve_pred16_t p2 = vcmpeqq_m_s32 (va, vb, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p2);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/*
+** test13:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vldrw.32 q[0-9]+, \[r1\], #16
+** vcmp.i32 eq, q[0-9]+, q[0-9]+
+** vpst
+** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Similar to test27 in dlstp-invalid-asm.c, but use a predicated load to make
+ it safe to implicitly predicate the vaddv. */
+void test14 (int32_t *a, int32_t *c, int n)
+{
+ int32_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ res += vaddvq_s32 (va);
+ int32x4_t vc = vdupq_n_s32 (res);
+ vstrwq_p_s32 (c, vc, p);
+ a += 4;
+ n -= 4;
+ }
+}
+
+/*
+** test14:
+**...
+** dlstp.32 lr, r2
+** vldrw.32 (q[0-9]+), \[r0\], #16
+** vaddv.s32 (r[0-9]+|ip), \1
+** add (r[0-9]+|ip), \3, \2
+** vdup.32 (q[0-9]+), \3
+** vstrw.32 \4, \[r1\]
+** letp lr, .*
+**...
+*/
+
+uint8_t test15 (uint8_t *a, uint8_t *b, int n)
+{
+ uint8_t res = 0;
+ uint8x16_t vc = vdupq_n_u8 (0);
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_u8 (b);
+ vc = vaddq_m (vc, va, vc, p);
+ res = vgetq_lane (vc, 5);
+
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ return res;
+}
+
+/*
+** test15:
+**...
+** dlstp.8 lr, r2
+**...
+** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
+**...
+** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
+**...
+** letp lr, .*
+** vmov.u8 r[0-9]+, \2\[5\]
+**...
+*/
+
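+/* Like test15, but with an additional unpredicated vaddq feeding the
+   predicated accumulation; the lane extract still happens after the loop.  */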
+uint8_t test16 (uint8_t *a, uint8_t *b, int n)
+{
+ uint8_t res = 0;
+ uint8x16_t vc = vdupq_n_u8 (0);
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_u8 (b);
+ vc = vaddq (va, vc);
+ vc = vaddq_m (vc, va, vc, p);
+ res = vgetq_lane (vc, 5);
+
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ return res;
+}
+
+/*
+** test16:
+**...
+** dlstp.8 lr, r2
+**...
+** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
+**...
+** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
+** vadd.i8 \2, q[0-9]+, q[0-9]+
+** letp lr, .*
+** vmov.u8 r[0-9]+, \2\[5\]
+**...
+*/
+
+
+
+/* Using an across-vector unpredicated instruction in a valid way.
+ This tests that "vc" has correctly masked the risky "vb". */
+uint16_t test18 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
+{
+ uint16x8_t vb = vldrhq_u16 (b);
+ uint16_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ uint16x8_t va = vldrhq_z_u16 (a, p);
+ uint16x8_t vc = vaddq_m_u16 (va, va, vb, p);
+ res += vaddvq_u16 (vc);
+ c += 8;
+ a += 8;
+ b += 8;
+ n -= 8;
+ }
+ return res;
+}
+
+/*
+** test18:
+**...
+** dlstp.16 lr, r3
+** vldrh.16 (q[0-9]+), \[r2\], #16
+** vadd.i16 \1, q[0-9]+, q[0-9]+
+** vaddv.u16 (r[0-9]+|ip), \1
+** add (r[0-9]+|ip), \3, \2
+** uxth \3, \3
+** letp lr, .*
+**...
+*/
+
+/* Using an across-vector unpredicated instruction with implicit scalar adding from outside the loop. */
+uint16_t test19 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
+{
+ uint16x8_t vb = vldrhq_u16 (b);
+ uint16_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ uint16x8_t va = vldrhq_z_u16 (a, p);
+ uint16x8_t vc = vaddq_m_u16 (va, va, vb, p);
+ res = vaddvaq_u16 (res, vc);
+ c += 8;
+ a += 8;
+ b += 8;
+ n -= 8;
+ }
+ return res;
+}
+
+/*
+** test19:
+**...
+** dlstp.16 lr, r3
+** vldrh.16 (q[0-9]+), \[r2\], #16
+** vadd.i16 \1, q[0-9]+, q[0-9]+
+** vaddva.u16 (r[0-9]+|ip), \1
+** uxth \2, \2
+** letp lr, .*
+**...
+*/
+
+
+/* Using an across-vector predicated instruction in a valid way. */
+uint16_t test20 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
+{
+ uint16_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ uint16x8_t va = vldrhq_u16 (a);
+ res = vaddvaq_p_u16 (res, va, p);
+ c += 8;
+ a += 8;
+ b += 8;
+ n -= 8;
+ }
+ return res;
+}
+
+/* The uxth could be moved outside the loop. */
+/*
+** test20:
+**...
+** dlstp.16 lr, r3
+** vldrh.16 (q[0-9]+), \[r2\], #16
+** vaddva.u16 (r[0-9]+|ip), \1
+** uxth \2, \2
+** letp lr, .*
+**...
+*/
+
+/* Using an across-vector predicated instruction in a valid way. */
+uint16_t test21 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
+{
+ uint16_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ uint16x8_t va = vldrhq_u16 (a);
+ res++;
+ res = vaddvaq_p_u16 (res, va, p);
+ c += 8;
+ a += 8;
+ b += 8;
+ n -= 8;
+ }
+ return res;
+}
+
+/* It should also be safe to move the uxth outside of the loop here. */
+/*
+** test21:
+**...
+** dlstp.16 lr, r3
+** vldrh.16 (q[0-9]+), \[r2\], #16
+** adds (r[0-9]+|ip), \2, #1
+** uxth \2, \2
+** vaddva.u16 \2, \1
+** uxth \2, \2
+** letp lr, .*
+**...
+*/
+
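+/* Unpredicated vmaxvq of an unsigned zero-predicated load: the zeroed
+   inactive lanes cannot raise an unsigned maximum, so they do not change the
+   result.  */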
+int test22 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ int res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ res = vmaxvq (res, va);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
+/*
+** test22:
+**...
+** dlstp.8 lr, r3
+**...
+** vldrb.8 (q[0-9]+), \[r[0-9]+\]
+**...
+** vmaxv.u8 (r[0-9]+|ip), \1
+** uxtb \2, \2
+** letp lr, .*
+**...
+*/
+
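+/* Unpredicated vmaxavq (maximum absolute value) of a zero-predicated load:
+   the zeroed lanes have absolute value zero, so they cannot change the
+   result.  */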
+int test23 (int8_t *a, int8_t *b, int8_t *c, int n)
+{
+ int res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ int8x16_t va = vldrbq_z_s8 (a, p);
+ res = vmaxavq (res, va);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
+/*
+** test23:
+**...
+** dlstp.8 lr, r3
+**...
+** vldrb.8 (q[0-9]+), \[r3\]
+**...
+** vmaxav.s8 (r[0-9]+|ip), \1
+** uxtb \2, \2
+** letp lr, .*
+**...
+*/
+
+/* Like test1, but update n before vctp, meaning we should only iterate for n-4
+ elements. */
+void test24 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n >= 1)
+ {
+ n-=4;
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c+=4;
+ a+=4;
+ b+=4;
+ }
+}
+/*
+** test24:
+**...
+** subs r3, r3, #4
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vldrw.32 q[0-9]+, \[r1\], #16
+** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c
new file mode 100644
index 00000000000..c784f540131
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c
@@ -0,0 +1,46 @@
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O3 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include <arm_mve.h>
+
+/* We don't support pattern recognition of signed N values when computing num_iter. */
+void test3 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ int num_iter = (n + 15)/16;
+ for (int i = 0; i < num_iter; i++)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n-=16;
+ a += 16;
+ b += 16;
+ c += 16;
+ }
+}
+
+/* Using a predicated vcmp to generate a new predicate value in the
+ loop and then using it in a predicated store insn. */
+void test17 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_s32 (va, vb);
+ mve_pred16_t p1 = vcmpeqq_m_s32 (va, vc, p);
+ vstrwq_p_s32 (c, vc, p1);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+/* This is an example of a loop that we could tail predicate but currently don't. */
+/* { dg-final { scan-assembler "letp" { xfail *-*-* } } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c
new file mode 100644
index 00000000000..6966a396604
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c
@@ -0,0 +1,44 @@
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-require-effective-target arm_mve_hw } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+#include "dlstp-int16x8.c"
+
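+/* Call the tail-predicated loop with element counts of zero, below, equal to
+   and above the number of lanes, and check both the computed prefix and the
+   untouched tail.  */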
+int main ()
+{
+ int i;
+ int16_t temp1[N];
+ int16_t temp2[N];
+ int16_t temp3[N];
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 0);
+ check_plus16 (temp1, temp2, temp3, 0);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 1);
+ check_plus16 (temp1, temp2, temp3, 1);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 7);
+ check_plus16 (temp1, temp2, temp3, 7);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 8);
+ check_plus16 (temp1, temp2, temp3, 8);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 9);
+ check_plus16 (temp1, temp2, temp3, 9);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 16);
+ check_plus16 (temp1, temp2, temp3, 16);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 17);
+ check_plus16 (temp1, temp2, temp3, 17);
+
+ reset_data16 (temp1, temp2, temp3, N);
+}
+
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c
new file mode 100644
index 00000000000..33632c5f14d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c
@@ -0,0 +1,31 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "../lob.h"
+
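+/* The canonical candidate loop: a vctp16q predicate generated from the
+   element count guards the loads, the add and the store, and n is decremented
+   by the number of lanes each iteration.  */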
+void __attribute__ ((noinline)) test (int16_t *a, int16_t *b, int16_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ int16x8_t va = vldrhq_z_s16 (a, p);
+ int16x8_t vb = vldrhq_z_s16 (b, p);
+ int16x8_t vc = vaddq_x_s16 (va, vb, p);
+ vstrhq_p_s16 (c, vc, p);
+ c+=8;
+ a+=8;
+ b+=8;
+ n-=8;
+ }
+}
+
+/* { dg-final { scan-assembler-times {\tdlstp.16} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp" } } */
+/* { dg-final { scan-assembler-not "\tvpst" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4-run.c
new file mode 100644
index 00000000000..6833dddde92
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4-run.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-require-effective-target arm_mve_hw } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include "dlstp-int32x4.c"
+
+int main ()
+{
+ int i;
+ int32_t temp1[N];
+ int32_t temp2[N];
+ int32_t temp3[N];
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 0);
+ check_plus32 (temp1, temp2, temp3, 0);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 1);
+ check_plus32 (temp1, temp2, temp3, 1);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 3);
+ check_plus32 (temp1, temp2, temp3, 3);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 4);
+ check_plus32 (temp1, temp2, temp3, 4);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 5);
+ check_plus32 (temp1, temp2, temp3, 5);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 8);
+ check_plus32 (temp1, temp2, temp3, 8);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 9);
+ check_plus32 (temp1, temp2, temp3, 9);
+
+ reset_data32 (temp1, temp2, temp3, N);
+}
+
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4.c
new file mode 100644
index 00000000000..5d09f784b77
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4.c
@@ -0,0 +1,31 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "../lob.h"
+
+void __attribute__ ((noinline)) test (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c+=4;
+ a+=4;
+ b+=4;
+ n-=4;
+ }
+}
+
+/* { dg-final { scan-assembler-times {\tdlstp.32} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp" } } */
+/* { dg-final { scan-assembler-not "\tvpst" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2-run.c
new file mode 100644
index 00000000000..cc0b9ce7ee9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2-run.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-require-effective-target arm_mve_hw } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include "dlstp-int64x2.c"
+
+int main ()
+{
+ int i;
+ int64_t temp1[N];
+ int64_t temp3[N];
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 0);
+ check_memcpy64 (temp1, temp3, 0);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 1);
+ check_memcpy64 (temp1, temp3, 1);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 2);
+ check_memcpy64 (temp1, temp3, 2);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 3);
+ check_memcpy64 (temp1, temp3, 3);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 4);
+ check_memcpy64 (temp1, temp3, 4);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 5);
+ check_memcpy64 (temp1, temp3, 5);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 6);
+ check_memcpy64 (temp1, temp3, 6);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 7);
+ check_memcpy64 (temp1, temp3, 7);
+
+ reset_data64 (temp1, temp3, N);
+}
+
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2.c
new file mode 100644
index 00000000000..21e882424ec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2.c
@@ -0,0 +1,28 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "../lob.h"
+
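+/* A two-lane tail-predicated copy using gather loads and scatter stores with
+   a {0, 8} byte offset vector.  */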
+void __attribute__ ((noinline)) test (int64_t *a, int64_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp64q (n);
+ int64x2_t va = vldrdq_gather_offset_z_s64 (a, vcreateq_u64 (0, 8), p);
+ vstrdq_scatter_offset_p_s64 (c, vcreateq_u64 (0, 8), va, p);
+ c+=2;
+ a+=2;
+ n-=2;
+ }
+}
+
+/* { dg-final { scan-assembler-times {\tdlstp.64} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp" } } */
+/* { dg-final { scan-assembler-not "\tvpst" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16-run.c
new file mode 100644
index 00000000000..d46571f329c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16-run.c
@@ -0,0 +1,44 @@
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-require-effective-target arm_mve_hw } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include "dlstp-int8x16.c"
+
+int main ()
+{
+ int i;
+ int8_t temp1[N];
+ int8_t temp2[N];
+ int8_t temp3[N];
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 0);
+ check_plus8 (temp1, temp2, temp3, 0);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 1);
+ check_plus8 (temp1, temp2, temp3, 1);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 15);
+ check_plus8 (temp1, temp2, temp3, 15);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 16);
+ check_plus8 (temp1, temp2, temp3, 16);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 17);
+ check_plus8 (temp1, temp2, temp3, 17);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 32);
+ check_plus8 (temp1, temp2, temp3, 32);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 33);
+ check_plus8 (temp1, temp2, temp3, 33);
+
+ reset_data8 (temp1, temp2, temp3, N);
+}
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16.c
new file mode 100644
index 00000000000..d5f22b50262
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16.c
@@ -0,0 +1,32 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "../lob.h"
+
+void __attribute__ ((noinline)) test (int8_t *a, int8_t *b, int8_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ int8x16_t va = vldrbq_z_s8 (a, p);
+ int8x16_t vb = vldrbq_z_s8 (b, p);
+ int8x16_t vc = vaddq_x_s8 (va, vb, p);
+ vstrbq_p_s8 (c, vc, p);
+ c+=16;
+ a+=16;
+ b+=16;
+ n-=16;
+ }
+}
+
+
+/* { dg-final { scan-assembler-times {\tdlstp.8} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp" } } */
+/* { dg-final { scan-assembler-not "\tvpst" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c
new file mode 100644
index 00000000000..26df2d30523
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c
@@ -0,0 +1,521 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O3 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include <limits.h>
+#include <arm_mve.h>
+
+/* Terminating on a non-zero number of elements. */
+void test0 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ while (n > 1)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n -= 16;
+ }
+}
+
+/* Terminating on n >= 0. */
+void test1 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ while (n >= 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n -= 16;
+ }
+}
+
+/* Similar, terminating on a non-zero number of elements, but in a for loop
+ format. */
+int32_t a[] = {0, 1, 2, 3, 4, 5, 6, 7};
+void test2 (int32_t *b, int num_elems)
+{
+ for (int i = num_elems; i >= 2; i-= 4)
+ {
+ mve_pred16_t p = vctp32q (i);
+ int32x4_t va = vldrwq_z_s32 (&(a[i]), p);
+ vstrwq_p_s32 (b + i, va, p);
+ }
+}
+
+/* Iteration counter counting up to num_iter, with a non-zero starting num. */
+void test3 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ int num_iter = (n + 15)/16;
+ for (int i = 1; i < num_iter; i++)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n -= 16;
+ }
+}
+
+/* Iteration counter counting up to num_iter, with a larger increment. */
+void test4 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ int num_iter = (n + 15)/16;
+ for (int i = 0; i < num_iter; i+=2)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n -= 16;
+ }
+}
+
+/* Using an unpredicated store instruction within the loop. */
+void test5 (uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *d, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_u8 (va, vb);
+ uint8x16_t vd = vaddq_x_u8 (va, vb, p);
+ vstrbq_u8 (d, vd);
+ n -= 16;
+ }
+}
+
+/* Using an unpredicated store outside the loop. */
+void test6 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p);
+ vx = vaddq_u8 (vx, vc);
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ vstrbq_u8 (c, vx);
+}
+
+/* Using a VPR that gets modified within the loop. */
+void test9 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ p++;
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Using a VPR that gets re-generated within the loop. */
+void test10 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ mve_pred16_t p = vctp32q (n);
+ while (n > 0)
+ {
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ p = vctp32q (n);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Using vctp32q_m instead of vctp32q. */
+void test11 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p0)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q_m (n, p0);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Using an unpredicated op with a scalar output, where the result is valid
+ outside the bb. This is invalid, because one of the inputs to the
+ unpredicated op is also unpredicated. */
+uint8_t test12 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx)
+{
+ uint8_t sum = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_u8 (b);
+ uint8x16_t vc = vaddq_u8 (va, vb);
+ sum += vaddvq_u8 (vc);
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ return sum;
+}
+
+/* Using an unpredicated vcmp to generate a new predicate value in the
+ loop and then using that VPR to predicate a store insn. */
+void test13 (int32_t *a, int32_t *b, int32x4_t vc, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_s32 (a);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_s32 (va, vb);
+ mve_pred16_t p1 = vcmpeqq_s32 (va, vc);
+ vstrwq_p_s32 (c, vc, p1);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Using an across-vector unpredicated instruction. "vb" is the risk. */
+uint16_t test14 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
+{
+ uint16x8_t vb = vldrhq_u16 (b);
+ uint16_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ uint16x8_t va = vldrhq_z_u16 (a, p);
+ vb = vaddq_u16 (va, vb);
+ res = vaddvq_u16 (vb);
+ c += 8;
+ a += 8;
+ b += 8;
+ n -= 8;
+ }
+ return res;
+}
+
+/* Using an across-vector unpredicated instruction. "vc" is the risk. */
+uint16_t test15 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
+{
+ uint16x8_t vb = vldrhq_u16 (b);
+ uint16_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ uint16x8_t va = vldrhq_z_u16 (a, p);
+ uint16x8_t vc = vaddq_u16 (va, vb);
+ res = vaddvaq_u16 (res, vc);
+ c += 8;
+ a += 8;
+ b += 8;
+ n -= 8;
+ }
+ return res;
+}
+
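+/* res also accumulates vb, which comes from an unpredicated load, through an
+   unpredicated vaddvaq; presumably this is what stops the transform.  */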
+uint16_t test16 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
+{
+ uint16_t res =0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ uint16x8_t vb = vldrhq_u16 (b);
+ uint16x8_t va = vldrhq_z_u16 (a, p);
+ res = vaddvaq_u16 (res, vb);
+ res = vaddvaq_p_u16 (res, va, p);
+ c += 8;
+ a += 8;
+ b += 8;
+ n -= 8;
+ }
+ return res;
+}
+
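+/* Across-vector reductions for which a zero in the inactive lanes is not a
+   neutral value (signed max, signed/unsigned min, minimum absolute value);
+   none of these loops get transformed.  */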
+int test17 (int8_t *a, int8_t *b, int8_t *c, int n)
+{
+ int res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ int8x16_t va = vldrbq_z_s8 (a, p);
+ res = vmaxvq (res, va);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
+
+
+int test18 (int8_t *a, int8_t *b, int8_t *c, int n)
+{
+ int res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ int8x16_t va = vldrbq_z_s8 (a, p);
+ res = vminvq (res, va);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
+int test19 (int8_t *a, int8_t *b, int8_t *c, int n)
+{
+ int res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ int8x16_t va = vldrbq_z_s8 (a, p);
+ res = vminavq (res, va);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
+int test20 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ int res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ res = vminvq (res, va);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
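+/* vshlcq shifts bits across the whole vector, with a scalar carry in and out,
+   so its result depends on every lane.  */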
+uint8x16_t test21 (uint8_t *a, uint32_t *b, int n, uint8x16_t res)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ res = vshlcq_u8 (va, b, 1);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
+int8x16_t test22 (int8_t *a, int32_t *b, int n, int8x16_t res)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ int8x16_t va = vldrbq_z_s8 (a, p);
+ res = vshlcq_s8 (va, b, 1);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
+/* Using an unsigned number of elements to count down from, with a >0 condition. */
+void test23 (int32_t *a, int32_t *b, int32_t *c, unsigned int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c+=4;
+ a+=4;
+ b+=4;
+ n-=4;
+ }
+}
+
+/* Using an unsigned number of elements to count up to, with a <n condition. */
+void test24 (uint8_t *a, uint8_t *b, uint8_t *c, unsigned int n)
+{
+ for (int i = 0; i < n; i+=16)
+ {
+ mve_pred16_t p = vctp8q (n-i);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n-=16;
+ }
+}
+
+
+/* Using an unsigned number of elements to count up to, with a <=n condition. */
+void test25 (uint8_t *a, uint8_t *b, uint8_t *c, unsigned int n)
+{
+ for (int i = 1; i <= n; i+=16)
+ {
+ mve_pred16_t p = vctp8q (n-i+1);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n-=16;
+ }
+}
+/* Update n twice in the loop. */
+void test26 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n >= 1)
+ {
+ n-=4;
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c+=4;
+ a+=4;
+ b+=4;
+ n-=4;
+ }
+}
+
+void test27 (int32_t *a, int32_t *c, int n)
+{
+ int32_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_s32 (a);
+ res += vaddvq_s32 (va);
+ int32x4_t vc = vdupq_n_s32 (res);
+ vstrwq_p_s32 (c, vc, p);
+ a += 4;
+ n -= 4;
+ }
+}
+
+/* Using an unpredicated vcmp to generate a new predicate value in the
+ loop and then using it in a predicated store insn. */
+void test28 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_s32 (b);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ mve_pred16_t p1 = vcmpeqq_s32 (va, vc);
+ vstrwq_p_s32 (c, vc, p1);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Using an unpredicated op with a scalar output, where the result is valid
+   outside the bb.  The unpredicated lanes are not guaranteed zero, so they
+   would affect the vaddv in the non-tail predicated case. */
+uint8_t test29 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx)
+{
+ uint8_t sum = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p);
+ sum += vaddvq_u8 (vc);
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ return sum;
+}
+
+/* Same as above, but with another scalar op between the unpredicated op and
+ the scalar op outside the loop. */
+uint8_t test30 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx, int g)
+{
+ uint8_t sum = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p);
+ sum += vaddvq_u8 (vc);
+ sum += g;
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ return sum;
+}
+
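+/* In tests 31 and 32 the accumulator is updated by an unpredicated vaddq
+   whose other operand comes from an unpredicated load, and a lane of it is
+   read back after the loop.  */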
+uint8_t test31 (uint8_t *a, uint8_t *b, int n)
+{
+ uint8_t res = 0;
+ uint8x16_t vc = vdupq_n_u8 (0);
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_u8 (b);
+ vc = vaddq (vb, vc);
+ res = vgetq_lane (vc, 5);
+
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ return res;
+}
+
+uint8_t test32 (uint8_t *a, uint8_t *b, int n)
+{
+ uint8_t res = 0;
+ uint8x16_t vc = vdupq_n_u8 (0);
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_u8 (b);
+ vc = vaddq_m (vc, va, vc, p);
+ vc = vaddq (vb, vc);
+ res = vgetq_lane (vc, 5);
+
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ return res;
+}
+
+/* { dg-final { scan-assembler-not "\tdlstp" } } */
+/* { dg-final { scan-assembler-not "\tletp" } } */