author     Andre Vieira <andre.simoesdiasvieira@arm.com>  2024-06-19 17:05:55 +0100
committer  Andre Vieira <andre.simoesdiasvieira@arm.com>  2024-06-19 17:11:17 +0100
commit     3dfc28dbbd21b1d708aa40064380ef4c42c994d7
tree       340ee5cc055032837da2506ebaa808675f63eaad
parent     5d0c1b4e0d33c2d1077264636d0a65ce206d0d96
arm: Add support for MVE Tail-Predicated Low Overhead Loops
This patch adds support for MVE Tail-Predicated Low Overhead Loops by using
the doloop functionality added to support predicated vectorized hardware
loops.

gcc/ChangeLog:

        * config/arm/arm-protos.h (arm_target_bb_ok_for_lob): Change
        declaration to pass basic_block.
        (arm_attempt_dlstp_transform): New declaration.
        * config/arm/arm.cc (TARGET_LOOP_UNROLL_ADJUST): Define target hook.
        (TARGET_PREDICT_DOLOOP_P): Likewise.
        (arm_target_bb_ok_for_lob): Adapt condition.
        (arm_mve_get_vctp_lanes): New function.
        (arm_dl_usage_type): New internal enum.
        (arm_get_required_vpr_reg): New function.
        (arm_get_required_vpr_reg_param): New function.
        (arm_get_required_vpr_reg_ret_val): New function.
        (arm_mve_get_loop_vctp): New function.
        (arm_mve_insn_predicated_by): New function.
        (arm_mve_across_lane_insn_p): New function.
        (arm_mve_load_store_insn_p): New function.
        (arm_mve_impl_pred_on_outputs_p): New function.
        (arm_mve_impl_pred_on_inputs_p): New function.
        (arm_last_vect_def_insn): New function.
        (arm_mve_impl_predicated_p): New function.
        (arm_mve_check_reg_origin_is_num_elems): New function.
        (arm_mve_dlstp_check_inc_counter): New function.
        (arm_mve_dlstp_check_dec_counter): New function.
        (arm_mve_loop_valid_for_dlstp): New function.
        (arm_predict_doloop_p): New function.
        (arm_loop_unroll_adjust): New function.
        (arm_emit_mve_unpredicated_insn_to_seq): New function.
        (arm_attempt_dlstp_transform): New function.
        * config/arm/arm.opt (mdlstp): New option.
        * config/arm/iterators.md (dlstp_elemsize, letp_num_lanes,
        letp_num_lanes_neg, letp_num_lanes_minus_1): New attributes.
        (DLSTP, LETP): New iterators.
        * config/arm/mve.md (predicated_doloop_end_internal<letp_num_lanes>,
        dlstp<dlstp_elemsize>_insn): New insn patterns.
        * config/arm/thumb2.md (doloop_end): Adapt to support
        tail-predicated loops.
        (doloop_begin): Likewise.
        * config/arm/types.md (mve_misc): New mve type to represent
        predicated_loop_end insn sequences.
        * config/arm/unspecs.md (DLSTP8, DLSTP16, DLSTP32, DLSTP64, LETP8,
        LETP16, LETP32, LETP64): New unspecs for DLSTP and LETP.

gcc/testsuite/ChangeLog:

        * gcc.target/arm/lob.h: Add new helpers.
        * gcc.target/arm/lob1.c: Use new helpers.
        * gcc.target/arm/lob6.c: Likewise.
        * gcc.target/arm/mve/dlstp-compile-asm-1.c: New test.
        * gcc.target/arm/mve/dlstp-compile-asm-2.c: New test.
        * gcc.target/arm/mve/dlstp-compile-asm-3.c: New test.
        * gcc.target/arm/mve/dlstp-int8x16.c: New test.
        * gcc.target/arm/mve/dlstp-int8x16-run.c: New test.
        * gcc.target/arm/mve/dlstp-int16x8.c: New test.
        * gcc.target/arm/mve/dlstp-int16x8-run.c: New test.
        * gcc.target/arm/mve/dlstp-int32x4.c: New test.
        * gcc.target/arm/mve/dlstp-int32x4-run.c: New test.
        * gcc.target/arm/mve/dlstp-int64x2.c: New test.
        * gcc.target/arm/mve/dlstp-int64x2-run.c: New test.
        * gcc.target/arm/mve/dlstp-invalid-asm.c: New test.

Co-authored-by: Stam Markianos-Wright <stam.markianos-wright@arm.com>
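For illustration only (not part of the commit), below is a minimal sketch of
the kind of intrinsics loop the new transformation targets, matching the
DLSTP_TYPE_A shape described in arm_mve_loop_valid_for_dlstp. The function
name and parameters are made up; the intrinsics are the standard arm_mve.h
ones exercised by the new dlstp-int32x4 tests, and the compile flags and
expected codegen are assumptions rather than verified output.

#include <arm_mve.h>
#include <stdint.h>

/* Hypothetical example: add N int32_t elements with tail predication.
   Each iteration handles 4 lanes; vctp32q masks off the lanes beyond n
   in the final iteration.  With MVE and LOB available (e.g. something
   like -O2 -march=armv8.1-m.main+mve) and the new -mdlstp flag left at
   its default, the doloop pass can now rewrite this into a dlstp.32/letp
   loop, roughly: dlstp.32 lr, rN; <unpredicated body>; letp lr, <loop>.  */
void
vadd_int32 (int32_t *a, int32_t *b, int32_t *c, int n)
{
  while (n > 0)
    {
      mve_pred16_t p = vctp32q (n);        /* predicate on remaining elements */
      int32x4_t va = vldrwq_z_s32 (a, p);  /* zero-predicated loads */
      int32x4_t vb = vldrwq_z_s32 (b, p);
      vstrwq_p_s32 (c, vaddq_x_s32 (va, vb, p), p);
      a += 4;
      b += 4;
      c += 4;
      n -= 4;                              /* decrement by the number of lanes */
    }
}

The analysis added in arm.cc below checks that every instruction in such a
loop is either explicitly predicated on the vctp mask or safe to predicate
implicitly, and only then strips the per-instruction VPT predication.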
-rw-r--r--  gcc/config/arm/arm-protos.h                             |    4
-rw-r--r--  gcc/config/arm/arm.cc                                   | 1249
-rw-r--r--  gcc/config/arm/arm.opt                                  |    3
-rw-r--r--  gcc/config/arm/iterators.md                             |   15
-rw-r--r--  gcc/config/arm/mve.md                                   |   50
-rw-r--r--  gcc/config/arm/thumb2.md                                |  138
-rw-r--r--  gcc/config/arm/types.md                                 |    6
-rw-r--r--  gcc/config/arm/unspecs.md                               |   14
-rw-r--r--  gcc/testsuite/gcc.target/arm/lob.h                      |  128
-rw-r--r--  gcc/testsuite/gcc.target/arm/lob1.c                     |   23
-rw-r--r--  gcc/testsuite/gcc.target/arm/lob6.c                     |    8
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c  |  146
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c  |  749
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c  |   46
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c    |   44
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c        |   31
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4-run.c    |   45
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4.c        |   31
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2-run.c    |   48
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2.c        |   28
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16-run.c    |   44
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16.c        |   32
-rw-r--r--  gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c    |  521
23 files changed, 3321 insertions, 82 deletions
diff --git a/gcc/config/arm/arm-protos.h b/gcc/config/arm/arm-protos.h
index 2cd560c9925..34d6be76e94 100644
--- a/gcc/config/arm/arm-protos.h
+++ b/gcc/config/arm/arm-protos.h
@@ -65,8 +65,8 @@ extern void arm_emit_speculation_barrier_function (void);
extern void arm_decompose_di_binop (rtx, rtx, rtx *, rtx *, rtx *, rtx *);
extern bool arm_q_bit_access (void);
extern bool arm_ge_bits_access (void);
-extern bool arm_target_insn_ok_for_lob (rtx);
-
+extern bool arm_target_bb_ok_for_lob (basic_block);
+extern int arm_attempt_dlstp_transform (rtx);
#ifdef RTX_CODE
enum reg_class
arm_mode_base_reg_class (machine_mode);
diff --git a/gcc/config/arm/arm.cc b/gcc/config/arm/arm.cc
index b8c32db0a1d..7d67d2cfee9 100644
--- a/gcc/config/arm/arm.cc
+++ b/gcc/config/arm/arm.cc
@@ -668,6 +668,12 @@ static const scoped_attribute_specs *const arm_attribute_table[] =
#undef TARGET_HAVE_CONDITIONAL_EXECUTION
#define TARGET_HAVE_CONDITIONAL_EXECUTION arm_have_conditional_execution
+#undef TARGET_LOOP_UNROLL_ADJUST
+#define TARGET_LOOP_UNROLL_ADJUST arm_loop_unroll_adjust
+
+#undef TARGET_PREDICT_DOLOOP_P
+#define TARGET_PREDICT_DOLOOP_P arm_predict_doloop_p
+
#undef TARGET_LEGITIMATE_CONSTANT_P
#define TARGET_LEGITIMATE_CONSTANT_P arm_legitimate_constant_p
@@ -34659,19 +34665,1236 @@ arm_invalid_within_doloop (const rtx_insn *insn)
}
bool
-arm_target_insn_ok_for_lob (rtx insn)
-{
- basic_block bb = BLOCK_FOR_INSN (insn);
- /* Make sure the basic block of the target insn is a simple latch
- having as single predecessor and successor the body of the loop
- itself. Only simple loops with a single basic block as body are
- supported for 'low over head loop' making sure that LE target is
- above LE itself in the generated code. */
-
- return single_succ_p (bb)
- && single_pred_p (bb)
- && single_succ_edge (bb)->dest == single_pred_edge (bb)->src
- && contains_no_active_insn_p (bb);
+arm_target_bb_ok_for_lob (basic_block bb)
+{
+ /* Make sure the basic block is a simple latch having as the single
+ predecessor and successor the body of the loop itself.
+ Only simple loops with a single basic block as the body are supported
+ for low-overhead loops, making sure that the LE target is above the LE
+ instruction in the generated code. */
+ return (single_succ_p (bb)
+ && single_pred_p (bb)
+ && single_succ_edge (bb)->dest == single_pred_edge (bb)->src);
+}
+
+/* Utility function: Given a VCTP or a VCTP_M insn, return the number of MVE
+ lanes based on the machine mode being used. */
+
+static int
+arm_mve_get_vctp_lanes (rtx_insn *insn)
+{
+ rtx insn_set = single_set (insn);
+ if (insn_set
+ && GET_CODE (SET_SRC (insn_set)) == UNSPEC
+ && (XINT (SET_SRC (insn_set), 1) == VCTP
+ || XINT (SET_SRC (insn_set), 1) == VCTP_M))
+ {
+ machine_mode mode = GET_MODE (SET_SRC (insn_set));
+ return ((VECTOR_MODE_P (mode) && VALID_MVE_PRED_MODE (mode))
+ ? GET_MODE_NUNITS (mode) : 0);
+ }
+ return 0;
+}
+
+enum arm_dl_usage_type { DL_USAGE_ANY = 0,
+ DL_USAGE_READ = 1,
+ DL_USAGE_WRITE = 2 };
+
+/* Check if INSN requires the use of the VPR reg; if it does, return the
+ sub-rtx of the VPR reg. The TYPE argument controls whether
+ this function should:
+ * For TYPE == DL_USAGE_ANY, check all operands, including the OUT operands,
+ and return the first occurrence of the VPR reg.
+ * For TYPE == DL_USAGE_READ, only check the input operands.
+ * For TYPE == DL_USAGE_WRITE, only check the output operands.
+ (INOUT operands are considered both as input and output operands)
+*/
+static rtx
+arm_get_required_vpr_reg (rtx_insn *insn,
+ arm_dl_usage_type type = DL_USAGE_ANY)
+{
+ gcc_assert (type < 3);
+ if (!NONJUMP_INSN_P (insn))
+ return NULL_RTX;
+
+ bool requires_vpr;
+ extract_constrain_insn (insn);
+ int n_operands = recog_data.n_operands;
+ if (recog_data.n_alternatives == 0)
+ return NULL_RTX;
+
+ /* Fill in recog_op_alt with information about the constraints of
+ this insn. */
+ preprocess_constraints (insn);
+
+ for (int op = 0; op < n_operands; op++)
+ {
+ requires_vpr = true;
+ if (type == DL_USAGE_READ
+ && recog_data.operand_type[op] == OP_OUT)
+ continue;
+ else if (type == DL_USAGE_WRITE
+ && recog_data.operand_type[op] == OP_IN)
+ continue;
+
+ /* Iterate through alternatives of operand "op" in recog_op_alt and
+ identify if the operand is required to be the VPR. */
+ for (int alt = 0; alt < recog_data.n_alternatives; alt++)
+ {
+ const operand_alternative *op_alt
+ = &recog_op_alt[alt * n_operands];
+ /* Fetch the reg_class for each entry and check it against the
+ VPR_REG reg_class. */
+ if (alternative_class (op_alt, op) != VPR_REG)
+ requires_vpr = false;
+ }
+ /* If all alternatives of the insn require the VPR reg for this operand,
+ it means that either this is a VPR-generating instruction, like a vctp,
+ vcmp, etc., or it is a VPT-predicated instruction. Return the sub-rtx
+ of the VPR reg operand. */
+ if (requires_vpr)
+ return recog_data.operand[op];
+ }
+ return NULL_RTX;
+}
+
+/* Wrapper function of arm_get_required_vpr_reg with TYPE == DL_USAGE_READ,
+ so return the VPR only if it is an input operand to the insn. */
+
+static rtx
+arm_get_required_vpr_reg_param (rtx_insn *insn)
+{
+ return arm_get_required_vpr_reg (insn, DL_USAGE_READ);
+}
+
+/* Wrapper function of arm_get_required_vpr_reg with TYPE == DL_USAGE_WRITE,
+ so return the VPR only if it is the return value, an output of, or is
+ clobbered by the insn. */
+
+static rtx
+arm_get_required_vpr_reg_ret_val (rtx_insn *insn)
+{
+ return arm_get_required_vpr_reg (insn, DL_USAGE_WRITE);
+}
+
+/* Return the first VCTP instruction in BB, if it exists, or NULL otherwise. */
+
+static rtx_insn *
+arm_mve_get_loop_vctp (basic_block bb)
+{
+ rtx_insn *insn = BB_HEAD (bb);
+
+ /* Now scan through all the instruction patterns and pick out the VCTP
+ instruction. We require arm_get_required_vpr_reg_param to be false
+ to make sure we pick up a VCTP, rather than a VCTP_M. */
+ FOR_BB_INSNS (bb, insn)
+ if (NONDEBUG_INSN_P (insn))
+ if (arm_get_required_vpr_reg_ret_val (insn)
+ && (arm_mve_get_vctp_lanes (insn) != 0)
+ && !arm_get_required_vpr_reg_param (insn))
+ return insn;
+ return NULL;
+}
+
+/* Return true if INSN is an MVE instruction that is VPT-predicable and is
+ predicated on VPR_REG. */
+
+static bool
+arm_mve_insn_predicated_by (rtx_insn *insn, rtx vpr_reg)
+{
+ rtx insn_vpr_reg_operand = (MVE_VPT_PREDICATED_INSN_P (insn)
+ ? arm_get_required_vpr_reg_param (insn)
+ : NULL_RTX);
+ return (insn_vpr_reg_operand
+ && rtx_equal_p (vpr_reg, insn_vpr_reg_operand));
+}
+
+/* Utility function to identify if INSN is an MVE instruction that performs
+ some across lane operation (and as a result does not align with normal
+ lane predication rules). All such instructions give only one scalar
+ output, except for vshlcq which gives a PARALLEL of a vector and a scalar
+ (one vector result and one carry output). */
+
+static bool
+arm_mve_across_lane_insn_p (rtx_insn* insn)
+{
+ df_ref insn_defs = NULL;
+ if (!MVE_VPT_PREDICABLE_INSN_P (insn))
+ return false;
+
+ FOR_EACH_INSN_DEF (insn_defs, insn)
+ if (!VALID_MVE_MODE (GET_MODE (DF_REF_REG (insn_defs)))
+ && !arm_get_required_vpr_reg_ret_val (insn))
+ return true;
+
+ return false;
+}
+
+/* Utility function to identify if INSN is an MVE load or store instruction.
+ * For TYPE == DL_USAGE_ANY, check all operands. If the function returns
+ true, INSN is a load or a store insn.
+ * For TYPE == DL_USAGE_READ, only check the input operands. If the
+ function returns true, INSN is a load insn.
+ * For TYPE == DL_USAGE_WRITE, only check the output operands. If the
+ function returns true, INSN is a store insn. */
+
+static bool
+arm_mve_load_store_insn_p (rtx_insn* insn,
+ arm_dl_usage_type type = DL_USAGE_ANY)
+{
+ gcc_assert (type < 3);
+ int n_operands = recog_data.n_operands;
+ extract_insn (insn);
+
+ for (int op = 0; op < n_operands; op++)
+ {
+ if (type == DL_USAGE_READ && recog_data.operand_type[op] == OP_OUT)
+ continue;
+ else if (type == DL_USAGE_WRITE && recog_data.operand_type[op] == OP_IN)
+ continue;
+ if (mve_memory_operand (recog_data.operand[op],
+ GET_MODE (recog_data.operand[op])))
+ return true;
+ }
+ return false;
+}
+
+/* Return TRUE if INSN is validated for implicit predication by how its outputs
+ are used.
+
+ If INSN is an MVE operation across lanes that is not predicated by
+ VCTP_VPR_GENERATED, it cannot be validated by the use of its outputs.
+
+ Any other INSN is safe to implicitly predicate if we don't use its outputs
+ outside the loop. The instructions that use this INSN's outputs will be
+ validated as we go through the analysis. */
+
+static bool
+arm_mve_impl_pred_on_outputs_p (rtx_insn *insn, rtx vctp_vpr_generated)
+{
+ /* Reject any unpredicated across lane operation. */
+ if (!arm_mve_insn_predicated_by (insn, vctp_vpr_generated)
+ && arm_mve_across_lane_insn_p (insn))
+ return false;
+
+ /* Next, scan forward to the various USEs of the DEFs in this insn. */
+ df_ref insn_def = NULL;
+ basic_block insn_bb = BLOCK_FOR_INSN (insn);
+ FOR_EACH_INSN_DEF (insn_def, insn)
+ {
+ for (df_ref use = DF_REG_USE_CHAIN (DF_REF_REGNO (insn_def));
+ use;
+ use = DF_REF_NEXT_REG (use))
+ {
+ rtx_insn *next_use_insn = DF_REF_INSN (use);
+ if (!INSN_P (next_use_insn) || DEBUG_INSN_P (next_use_insn))
+ continue;
+
+ if (insn_bb != BLOCK_FOR_INSN (next_use_insn))
+ return false;
+ }
+ }
+ return true;
+}
+
+
+/* Return the prevailing definition of OP before CUR_INSN in the same
+ basic block as CUR_INSN if one exists, or NULL otherwise. */
+
+static rtx_insn*
+arm_last_vect_def_insn (rtx op, rtx_insn *cur_insn)
+{
+ if (!REG_P (op)
+ || !BLOCK_FOR_INSN (cur_insn))
+ return NULL;
+
+ df_ref def_insns;
+ rtx_insn *last_def = NULL;
+ for (def_insns = DF_REG_DEF_CHAIN (REGNO (op));
+ def_insns;
+ def_insns = DF_REF_NEXT_REG (def_insns))
+ {
+ rtx_insn *def_insn = DF_REF_INSN (def_insns);
+ /* Definition not in the loop body or after the current insn. */
+ if (DF_REF_BB (def_insns) != BLOCK_FOR_INSN (cur_insn)
+ || INSN_UID (def_insn) >= INSN_UID (cur_insn))
+ continue;
+
+ if (!last_def || INSN_UID (def_insn) > INSN_UID (last_def))
+ last_def = def_insn;
+ }
+ return last_def;
+}
+
+
+/* This function returns TRUE if we can validate the implicit predication of
+ INSN_IN with VCTP_VPR_GENERATED based on the definition of the instruction's
+ input operands.
+
+ If INSN_IN is an MVE operation across lanes, then all of its MVE vector
+ operands must have their tail-predicated lanes be zeroes. We keep track of any
+ instructions that define vector operands for which this is true in
+ PROPS_ZERO_SET.
+
+ For any other INSN_IN, all of its operands must be defined
+ inside the loop body by an instruction that comes before INSN_IN and is not
+ an MVE load predicated by a different VPR. These instructions have all been
+ validated for explicit or implicit predication.
+ */
+
+static bool
+arm_mve_impl_pred_on_inputs_p (vec <rtx_insn *> *props_zero_set,
+ rtx_insn *insn_in, rtx vctp_vpr_generated)
+{
+ /* If all inputs come from instructions that are explicitly or
+ implicitly predicated by the same predicate then it is safe to
+ implicitly predicate this instruction. */
+ df_ref insn_uses = NULL;
+ bool across_lane = arm_mve_across_lane_insn_p (insn_in);
+ FOR_EACH_INSN_USE (insn_uses, insn_in)
+ {
+ rtx op = DF_REF_REG (insn_uses);
+ rtx_insn *def_insn = arm_last_vect_def_insn (op, insn_in);
+ if (across_lane)
+ {
+ if (!VALID_MVE_MODE (GET_MODE (op)))
+ continue;
+ if (!def_insn || !props_zero_set->contains (def_insn))
+ return false;
+
+ continue;
+ }
+
+ if (!def_insn
+ || (!arm_mve_insn_predicated_by (def_insn, vctp_vpr_generated)
+ && arm_mve_load_store_insn_p (def_insn, DL_USAGE_READ)))
+ return false;
+ }
+
+ return true;
+}
+
+
+/* Determine whether INSN_IN is safe to implicitly predicate based on the type
+ of instruction and where needed the definition of its inputs and the uses of
+ its outputs.
+ Return TRUE if it is safe to implicitly predicate and FALSE otherwise.
+
+ * If INSN_IN is a store, then it is always unsafe to implicitly predicate it.
+ * If INSN_IN is a load, only reject implicit predication if its uses
+ directly invalidate it.
+ * If INSN_IN operates across vector lanes and does not have the
+ "mve_safe_imp_xlane_pred" attribute, then it is always unsafe to implicitly
+ predicate.
+ * If INSN_IN operates on Floating Point elements and we are not compiling
+ with -Ofast, then it is unsafe to implicitly predicate it as we may be
+ changing exception and cumulative bits behaviour.
+ * If INSN_IN is a VCTP instruction, then it is safe to implicitly predicate,
+ but instructions that use this predicate will need to be checked
+ just like any other UNPREDICATED MVE instruction.
+ * Otherwise check if INSN_IN's inputs or uses of outputs can validate its
+ implicit predication.
+
+ * If all inputs come from instructions that are explicitly or implicitly
+ predicated by the same predicate then it is safe to implicitly predicate
+ this instruction.
+ * If INSN_IN is an operation across lanes with the "mve_safe_imp_xlane_pred"
+ attribute, then all its operands must have zeroed falsely predicated tail
+ lanes.
+
+ * Otherwise, check if the implicit predication of INSN_IN can be validated
+ based on its inputs, and if not check whether it can be validated based on
+ how its outputs are used. */
+
+static bool
+arm_mve_impl_predicated_p (vec <rtx_insn *> *props_zero_set,
+ rtx_insn *insn_in, rtx vctp_vpr_generated)
+{
+
+ /* If INSN_IN is a store, then it is always unsafe to implicitly
+ predicate it. */
+ if (arm_mve_load_store_insn_p (insn_in, DL_USAGE_WRITE))
+ return false;
+
+ /* If INSN_IN is a load, only reject implicit predication if its uses
+ directly invalidate it. */
+ if (arm_mve_load_store_insn_p (insn_in, DL_USAGE_READ))
+ {
+ if (!arm_mve_impl_pred_on_outputs_p (insn_in, vctp_vpr_generated))
+ return false;
+ return true;
+ }
+
+ /* If INSN_IN operates across vector lanes and does not have the
+ "mve_safe_imp_xlane_pred" attribute, then it is always unsafe to implicitly
+ predicate. */
+ if (arm_mve_across_lane_insn_p (insn_in)
+ && (get_attr_mve_safe_imp_xlane_pred (insn_in)
+ != MVE_SAFE_IMP_XLANE_PRED_YES))
+ return false;
+
+ /* If INSN_IN operates on Floating Point elements and we are not compiling
+ with -Ofast, then it is unsafe to implicitly predicate it as we may be
+ changing exception and cumulative bits behaviour. */
+ if (!flag_unsafe_math_optimizations
+ && flag_trapping_math
+ && MVE_VPT_UNPREDICATED_INSN_P (insn_in))
+ {
+ df_ref def;
+ FOR_EACH_INSN_DEF (def, insn_in)
+ if (DF_REF_TYPE (def) == DF_REF_REG_DEF
+ && FLOAT_MODE_P (GET_MODE (DF_REF_REG (def))))
+ return false;
+ FOR_EACH_INSN_USE (def, insn_in)
+ if (DF_REF_TYPE (def) == DF_REF_REG_DEF
+ && FLOAT_MODE_P (GET_MODE (DF_REF_REG (def))))
+ return false;
+ }
+
+ /* If INSN_IN is a VCTP instruction, then it is safe to implicitly predicate,
+ but instructions that use this predicate will need to be checked
+ just like any other UNPREDICATED MVE instruction. */
+ if (arm_get_required_vpr_reg_ret_val (insn_in)
+ && (arm_mve_get_vctp_lanes (insn_in) != 0))
+ return true;
+
+ /* Otherwise, check if the implicit predication of INSN_IN can be validated
+ based on its inputs, and if not check whether it can be validated based on
+ how its outputs are used. */
+ return (arm_mve_impl_pred_on_inputs_p (props_zero_set, insn_in, vctp_vpr_generated)
+ || arm_mve_impl_pred_on_outputs_p (insn_in, vctp_vpr_generated));
+}
+
+/* Helper function to `arm_mve_dlstp_check_inc_counter` and to
+ `arm_mve_dlstp_check_dec_counter`. In the situations where the loop counter
+ is incrementing by 1 or decrementing by 1 in each iteration, ensure that the
+ number of iterations, the value of REG, going into the loop, was calculated
+ as:
+ REG = (N + [1, VCTP_STEP - 1]) / VCTP_STEP
+
+ where N is equivalent to the VCTP_REG.
+*/
+
+static bool
+arm_mve_check_reg_origin_is_num_elems (loop *loop, rtx reg, rtx vctp_step,
+ rtx vctp_reg)
+{
+ df_ref counter_max_last_def = NULL;
+
+ /* More than one reaching definition. */
+ if (DF_REG_DEF_COUNT (REGNO (reg)) > 2)
+ return false;
+
+ /* Look for a single definition of REG going into the loop. The DEF_CHAIN will
+ have at least two values, as this is a loop induction variable that is
+ defined outside the loop. */
+ for (df_ref def = DF_REG_DEF_CHAIN (REGNO (reg));
+ def;
+ def = DF_REF_NEXT_REG (def))
+ {
+ /* Skip the update inside the loop, this has already been checked by the
+ iv_analyze call earlier. */
+ if (DF_REF_BB (def) == loop->header)
+ continue;
+
+ counter_max_last_def = def;
+ break;
+ }
+
+ if (!counter_max_last_def)
+ return false;
+
+ rtx counter_max_last_set = single_set (DF_REF_INSN (counter_max_last_def));
+
+ if (!counter_max_last_set)
+ return false;
+
+ /* If we encounter a simple SET from a REG, follow it through. */
+ if (REG_P (SET_SRC (counter_max_last_set)))
+ {
+ if (DF_REG_DEF_COUNT (REGNO (SET_SRC (counter_max_last_set))) != 1)
+ return false;
+
+ counter_max_last_def
+ = DF_REG_DEF_CHAIN (REGNO (SET_SRC (counter_max_last_set)));
+ counter_max_last_set
+ = single_set (DF_REF_INSN (counter_max_last_def));
+
+ if (!counter_max_last_set)
+ return false;
+ }
+
+ /* We are looking for:
+ COUNTER_MAX_LAST_SET = (N + VCTP_STEP - 1) / VCTP_STEP.
+ We currently only support the unsigned VCTP_OP case. */
+ rtx division = SET_SRC (counter_max_last_set);
+ if (GET_CODE (division) != LSHIFTRT)
+ return false;
+
+ /* Now check that we are dividing by VCTP_STEP, i.e. the number of lanes. */
+ rtx divisor = XEXP (division, 1);
+ unsigned vctp_step_cst = abs_hwi (INTVAL (vctp_step));
+ if (!CONST_INT_P (divisor)
+ || (1U << INTVAL (divisor) != vctp_step_cst))
+ return false;
+
+ rtx dividend = XEXP (division, 0);
+ if (!REG_P (dividend))
+ /* Subreg? */
+ return false;
+
+ /* For now only support the simple case: this only works for unsigned N;
+ any signed N will have further computations to deal with overflow. */
+ if (DF_REG_DEF_COUNT (REGNO (dividend)) != 1)
+ return false;
+
+ rtx_insn *dividend_insn = DF_REF_INSN (DF_REG_DEF_CHAIN (REGNO (dividend)));
+ rtx dividend_op = single_set (dividend_insn);
+ if (!dividend_op
+ || GET_CODE (SET_SRC (dividend_op)) != PLUS)
+ return false;
+
+ /* Check if PLUS_OP is (VCTP_OP + VAL), where VAL = [1, VCTP_STEP - 1]. */
+ rtx plus_op = SET_SRC (dividend_op);
+ if (!REG_P (XEXP (plus_op, 0))
+ || !CONST_INT_P (XEXP (plus_op, 1))
+ || !IN_RANGE (INTVAL (XEXP (plus_op, 1)), 1, vctp_step_cst - 1))
+ return false;
+
+ /* VCTP_REG may have been copied before entering the loop; let's see if we can
+ trace such a copy back. If we have more than one reaching definition then
+ bail out as analysis will be too difficult. */
+ if (DF_REG_DEF_COUNT (REGNO (vctp_reg)) > 2)
+ return false;
+
+ /* Look for the definition of N. */
+ for (df_ref def = DF_REG_DEF_CHAIN (REGNO (vctp_reg));
+ def;
+ def = DF_REF_NEXT_REG (def))
+ {
+ if (DF_REF_BB (def) == loop->header)
+ continue;
+ rtx set = single_set (DF_REF_INSN (def));
+ if (set
+ && REG_P (SET_SRC (set))
+ && !HARD_REGISTER_P (SET_SRC (set)))
+ vctp_reg = SET_SRC (set);
+ }
+
+ return rtx_equal_p (vctp_reg, XEXP (plus_op, 0));
+}
+
+/* If we have identified the loop to have an incrementing counter, we need to
+ make sure that it increments by 1 and that the loop is structured correctly:
+ * The counter starts from 0
+ * The counter terminates at (num_of_elem + num_of_lanes - 1) / num_of_lanes
+ * The vctp insn uses a reg that decrements appropriately in each iteration.
+*/
+
+static rtx_insn*
+arm_mve_dlstp_check_inc_counter (loop *loop, rtx_insn* vctp_insn,
+ rtx condconst, rtx condcount)
+{
+ rtx vctp_reg = XVECEXP (XEXP (PATTERN (vctp_insn), 1), 0, 0);
+ /* The loop latch has to be empty. When compiling all the known MVE LoLs in
+ user applications, none of those with incrementing counters had any real
+ insns in the loop latch. As such, this function has only been tested with
+ an empty latch and may misbehave or ICE if we somehow get here with an
+ increment in the latch, so, for correctness, error out early. */
+ if (!empty_block_p (loop->latch))
+ return NULL;
+
+ class rtx_iv vctp_reg_iv;
+ /* For loops of DLSTP_TYPE_B, the loop counter is independent of the decrement
+ of the reg used in the vctp_insn. So run iv analysis on that reg. This
+ has to succeed for such loops to be supported. */
+ if (!iv_analyze (vctp_insn, as_a<scalar_int_mode> (GET_MODE (vctp_reg)),
+ vctp_reg, &vctp_reg_iv))
+ return NULL;
+
+ /* Extract the decrementnum of the vctp reg from the iv. This decrementnum
+ is the number of lanes/elements it decrements from the remaining number of
+ lanes/elements to process in the loop; for this reason this is always a
+ negative number, but to simplify later checks we use its absolute value. */
+ HOST_WIDE_INT decrementnum = INTVAL (vctp_reg_iv.step);
+ if (decrementnum >= 0)
+ return NULL;
+ decrementnum = abs_hwi (decrementnum);
+
+ /* Find where both of those are modified in the loop header bb. */
+ df_ref condcount_reg_set_df = df_bb_regno_only_def_find (loop->header,
+ REGNO (condcount));
+ df_ref vctp_reg_set_df = df_bb_regno_only_def_find (loop->header,
+ REGNO (vctp_reg));
+ if (!condcount_reg_set_df || !vctp_reg_set_df)
+ return NULL;
+ rtx condcount_reg_set = single_set (DF_REF_INSN (condcount_reg_set_df));
+ rtx vctp_reg_set = single_set (DF_REF_INSN (vctp_reg_set_df));
+ if (!condcount_reg_set || !vctp_reg_set)
+ return NULL;
+
+ /* Ensure the modification of the vctp reg from df is consistent with
+ the iv and the number of lanes on the vctp insn. */
+ if (GET_CODE (SET_SRC (vctp_reg_set)) != PLUS
+ || !REG_P (SET_DEST (vctp_reg_set))
+ || !REG_P (XEXP (SET_SRC (vctp_reg_set), 0))
+ || REGNO (SET_DEST (vctp_reg_set))
+ != REGNO (XEXP (SET_SRC (vctp_reg_set), 0))
+ || !CONST_INT_P (XEXP (SET_SRC (vctp_reg_set), 1))
+ || INTVAL (XEXP (SET_SRC (vctp_reg_set), 1)) >= 0
+ || decrementnum != abs_hwi (INTVAL (XEXP (SET_SRC (vctp_reg_set), 1)))
+ || decrementnum != arm_mve_get_vctp_lanes (vctp_insn))
+ return NULL;
+
+ if (REG_P (condcount) && REG_P (condconst))
+ {
+ /* First we need to prove that the loop is going 0..condconst with an
+ inc of 1 in each iteration. */
+ if (GET_CODE (SET_SRC (condcount_reg_set)) == PLUS
+ && CONST_INT_P (XEXP (SET_SRC (condcount_reg_set), 1))
+ && INTVAL (XEXP (SET_SRC (condcount_reg_set), 1)) == 1)
+ {
+ rtx counter_reg = SET_DEST (condcount_reg_set);
+ /* Check that the counter did indeed start from zero. */
+ df_ref this_set = DF_REG_DEF_CHAIN (REGNO (counter_reg));
+ if (!this_set)
+ return NULL;
+ df_ref last_set_def = DF_REF_NEXT_REG (this_set);
+ if (!last_set_def)
+ return NULL;
+ rtx_insn* last_set_insn = DF_REF_INSN (last_set_def);
+ rtx last_set = single_set (last_set_insn);
+ if (!last_set)
+ return NULL;
+ rtx counter_orig_set;
+ counter_orig_set = SET_SRC (last_set);
+ if (!CONST_INT_P (counter_orig_set)
+ || (INTVAL (counter_orig_set) != 0))
+ return NULL;
+ /* And finally check that the target value of the counter,
+ condconst, is of the correct shape. */
+ if (!arm_mve_check_reg_origin_is_num_elems (loop, condconst,
+ vctp_reg_iv.step,
+ vctp_reg))
+ return NULL;
+ }
+ else
+ return NULL;
+ }
+ else
+ return NULL;
+
+ /* Everything looks valid. */
+ return vctp_insn;
+}
+
+/* Helper function to `arm_mve_loop_valid_for_dlstp`. In the case of a
+ counter that is decrementing, ensure that it is decrementing by the
+ right amount in each iteration and that the target condition is what
+ we expect. */
+
+static rtx_insn*
+arm_mve_dlstp_check_dec_counter (loop *loop, rtx_insn* vctp_insn,
+ rtx condconst, rtx condcount)
+{
+ rtx vctp_reg = XVECEXP (XEXP (PATTERN (vctp_insn), 1), 0, 0);
+ class rtx_iv vctp_reg_iv;
+ HOST_WIDE_INT decrementnum;
+ /* For decrementing loops of DLSTP_TYPE_A, the counter is usually present in the
+ loop latch. Here we simply need to verify that this counter is the same
+ reg that is also used in the vctp_insn and that it is not otherwise
+ modified. */
+ rtx_insn *dec_insn = BB_END (loop->latch);
+ /* If not in the loop latch, try to find the decrement in the loop header. */
+ if (!NONDEBUG_INSN_P (dec_insn))
+ {
+ df_ref temp = df_bb_regno_only_def_find (loop->header, REGNO (condcount));
+ /* If we haven't been able to find the decrement, bail out. */
+ if (!temp)
+ return NULL;
+ dec_insn = DF_REF_INSN (temp);
+ }
+
+ rtx dec_set = single_set (dec_insn);
+
+ /* Next, ensure that it is a PLUS of the form:
+ (set (reg a) (plus (reg a) (const_int)))
+ where (reg a) is the same as condcount. */
+ if (!dec_set
+ || !REG_P (SET_DEST (dec_set))
+ || !REG_P (XEXP (SET_SRC (dec_set), 0))
+ || !CONST_INT_P (XEXP (SET_SRC (dec_set), 1))
+ || REGNO (SET_DEST (dec_set))
+ != REGNO (XEXP (SET_SRC (dec_set), 0))
+ || REGNO (SET_DEST (dec_set)) != REGNO (condcount))
+ return NULL;
+
+ decrementnum = INTVAL (XEXP (SET_SRC (dec_set), 1));
+
+ /* This decrementnum is the number of lanes/elements it decrements from the
+ remaining number of lanes/elements to process in the loop; for this reason
+ this is always a negative number, but to simplify later checks we use its
+ absolute value. */
+ if (decrementnum >= 0)
+ return NULL;
+ decrementnum = -decrementnum;
+
+ /* If the decrementnum is a 1, then we need to look at the loop vctp_reg and
+ verify that it also decrements correctly.
+ Then, we need to establish that the starting value of the loop decrement
+ originates from the starting value of the vctp decrement. */
+ if (decrementnum == 1)
+ {
+ class rtx_iv vctp_reg_iv, condcount_reg_iv;
+ /* The loop counter is found to be independent of the decrement
+ of the reg used in the vctp_insn, again. Ensure that IV analysis
+ succeeds and check the step. */
+ if (!iv_analyze (vctp_insn, as_a<scalar_int_mode> (GET_MODE (vctp_reg)),
+ vctp_reg, &vctp_reg_iv))
+ return NULL;
+ /* Ensure it matches the number of lanes of the vctp instruction. */
+ if (abs (INTVAL (vctp_reg_iv.step))
+ != arm_mve_get_vctp_lanes (vctp_insn))
+ return NULL;
+
+ if (!arm_mve_check_reg_origin_is_num_elems (loop, condcount,
+ vctp_reg_iv.step,
+ vctp_reg))
+ return NULL;
+ }
+ /* If the decrements are the same, then the situation is simple: either they
+ are also the same reg, which is safe, or they are different registers, in
+ which case make sure that there is only a simple SET from one to the
+ other inside the loop. */
+ else if (decrementnum == arm_mve_get_vctp_lanes (vctp_insn))
+ {
+ if (REGNO (condcount) != REGNO (vctp_reg))
+ {
+ /* It wasn't the same reg, but it could be behind a
+ (set (vctp_reg) (condcount)), so instead find where
+ the VCTP insn is DEF'd inside the loop. */
+ rtx_insn *vctp_reg_insn
+ = DF_REF_INSN (df_bb_regno_only_def_find (loop->header,
+ REGNO (vctp_reg)));
+ rtx vctp_reg_set = single_set (vctp_reg_insn);
+ /* This must just be a simple SET from the condcount. */
+ if (!vctp_reg_set
+ || !REG_P (SET_DEST (vctp_reg_set))
+ || !REG_P (SET_SRC (vctp_reg_set))
+ || REGNO (SET_SRC (vctp_reg_set)) != REGNO (condcount))
+ return NULL;
+ }
+ }
+ else
+ return NULL;
+
+ /* We now only need to find out that the loop terminates with a LE
+ zero condition. If condconst is a const_int, then this is easy.
+ If it's a REG, look at the last condition+jump in a bb before
+ the loop, because that usually will have a branch jumping over
+ the loop header. */
+ rtx_insn *jump_insn = BB_END (loop->header);
+ if (CONST_INT_P (condconst)
+ && !(INTVAL (condconst) == 0 && JUMP_P (jump_insn)
+ && GET_CODE (XEXP (PATTERN (jump_insn), 1)) == IF_THEN_ELSE
+ && (GET_CODE (XEXP (XEXP (PATTERN (jump_insn), 1), 0)) == NE
+ || GET_CODE (XEXP (XEXP (PATTERN (jump_insn), 1), 0)) == GT)))
+ return NULL;
+ else if (REG_P (condconst))
+ {
+ basic_block pre_loop_bb = single_pred (loop_preheader_edge (loop)->src);
+ if (!pre_loop_bb)
+ return NULL;
+
+ rtx initial_compare = NULL_RTX;
+ if (!(prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb))
+ && INSN_P (prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb)))))
+ return NULL;
+ else
+ initial_compare
+ = single_set (prev_nonnote_nondebug_insn_bb (BB_END (pre_loop_bb)));
+ if (!(initial_compare
+ && cc_register (SET_DEST (initial_compare), VOIDmode)
+ && GET_CODE (SET_SRC (initial_compare)) == COMPARE
+ && CONST_INT_P (XEXP (SET_SRC (initial_compare), 1))
+ && INTVAL (XEXP (SET_SRC (initial_compare), 1)) == 0))
+ return NULL;
+
+ /* Usually this is a LE condition, but it can also just be a GT or an EQ
+ condition (if the value is unsigned or the compiler knows it's not negative). */
+ rtx_insn *loop_jumpover = BB_END (pre_loop_bb);
+ if (!(JUMP_P (loop_jumpover)
+ && GET_CODE (XEXP (PATTERN (loop_jumpover), 1)) == IF_THEN_ELSE
+ && (GET_CODE (XEXP (XEXP (PATTERN (loop_jumpover), 1), 0)) == LE
+ || GET_CODE (XEXP (XEXP (PATTERN (loop_jumpover), 1), 0)) == GT
+ || GET_CODE (XEXP (XEXP (PATTERN (loop_jumpover), 1), 0)) == EQ)))
+ return NULL;
+ }
+
+ /* Everything looks valid. */
+ return vctp_insn;
+}
+
+/* Function to check a loop's structure to see if it is a valid candidate for
+ an MVE Tail Predicated Low-Overhead Loop. Returns the loop's VCTP_INSN if
+ it is valid, or NULL if it isn't. */
+
+static rtx_insn*
+arm_mve_loop_valid_for_dlstp (loop *loop)
+{
+ /* Doloop can only be done "elementwise" with predicated dlstp/letp if it
+ contains a VCTP on the number of elements processed by the loop.
+ Find the VCTP predicate generation inside the loop body BB. */
+ rtx_insn *vctp_insn = arm_mve_get_loop_vctp (loop->header);
+ if (!vctp_insn)
+ return NULL;
+
+ /* We only support two loop forms for tail predication:
+ DLSTP_TYPE_A) Loops of the form:
+ int num_of_lanes = 128 / elem_size;
+ while (num_of_elem > 0)
+ {
+ p = vctp<size> (num_of_elem);
+ num_of_elem -= num_of_lanes;
+ }
+ DLSTP_TYPE_B) Loops of the form:
+ int num_of_lanes = 128 / elem_size;
+ int num_of_iters = (num_of_elem + num_of_lanes - 1) / num_of_lanes;
+ for (i = 0; i < num_of_iters; i++)
+ {
+ p = vctp<size> (num_of_elem);
+ num_of_elem -= num_of_lanes;
+ }
+
+ Then, depending on the type of loop above, we will need to do
+ different sets of checks. */
+ iv_analysis_loop_init (loop);
+
+ /* In order to find out if the loop is of DLSTP_TYPE_A or DLSTP_TYPE_B above,
+ look for the loop counter: it will either be incrementing by one per
+ iteration or it will be decrementing by num_of_lanes. We can find the
+ loop counter in the condition at the end of the loop. */
+ rtx_insn *loop_cond = prev_nonnote_nondebug_insn_bb (BB_END (loop->header));
+ if (!(cc_register (XEXP (PATTERN (loop_cond), 0), VOIDmode)
+ && GET_CODE (XEXP (PATTERN (loop_cond), 1)) == COMPARE))
+ return NULL;
+
+ /* The operands in the condition: Try to identify which one is the
+ constant and which is the counter and run IV analysis on the latter. */
+ rtx cond_arg_1 = XEXP (XEXP (PATTERN (loop_cond), 1), 0);
+ rtx cond_arg_2 = XEXP (XEXP (PATTERN (loop_cond), 1), 1);
+
+ rtx loop_cond_constant;
+ rtx loop_counter;
+ class rtx_iv cond_counter_iv, cond_temp_iv;
+
+ if (CONST_INT_P (cond_arg_1))
+ {
+ /* cond_arg_1 is the constant and cond_arg_2 is the counter. */
+ loop_cond_constant = cond_arg_1;
+ loop_counter = cond_arg_2;
+ iv_analyze (loop_cond, as_a<scalar_int_mode> (GET_MODE (cond_arg_2)),
+ cond_arg_2, &cond_counter_iv);
+ }
+ else if (CONST_INT_P (cond_arg_2))
+ {
+ /* cond_arg_2 is the constant and cond_arg_1 is the counter. */
+ loop_cond_constant = cond_arg_2;
+ loop_counter = cond_arg_1;
+ iv_analyze (loop_cond, as_a<scalar_int_mode> (GET_MODE (cond_arg_1)),
+ cond_arg_1, &cond_counter_iv);
+ }
+ else if (REG_P (cond_arg_1) && REG_P (cond_arg_2))
+ {
+ /* If both operands to the compare are REGs, we can safely
+ run IV analysis on both and then determine which is the
+ constant by looking at the step.
+ First assume cond_arg_1 is the counter. */
+ loop_counter = cond_arg_1;
+ loop_cond_constant = cond_arg_2;
+ iv_analyze (loop_cond, as_a<scalar_int_mode> (GET_MODE (cond_arg_1)),
+ cond_arg_1, &cond_counter_iv);
+ iv_analyze (loop_cond, as_a<scalar_int_mode> (GET_MODE (cond_arg_2)),
+ cond_arg_2, &cond_temp_iv);
+
+ /* Look at the steps and swap around the rtx's if needed. Error out if
+ one of them cannot be identified as constant. */
+ if (!CONST_INT_P (cond_counter_iv.step) || !CONST_INT_P (cond_temp_iv.step))
+ return NULL;
+ if (INTVAL (cond_counter_iv.step) != 0 && INTVAL (cond_temp_iv.step) != 0)
+ return NULL;
+ if (INTVAL (cond_counter_iv.step) == 0 && INTVAL (cond_temp_iv.step) != 0)
+ {
+ loop_counter = cond_arg_2;
+ loop_cond_constant = cond_arg_1;
+ cond_counter_iv = cond_temp_iv;
+ }
+ }
+ else
+ return NULL;
+
+ if (!REG_P (loop_counter))
+ return NULL;
+ if (!(REG_P (loop_cond_constant) || CONST_INT_P (loop_cond_constant)))
+ return NULL;
+
+ /* Now we have extracted the IV step of the loop counter, call the
+ appropriate checking function. */
+ if (INTVAL (cond_counter_iv.step) > 0)
+ return arm_mve_dlstp_check_inc_counter (loop, vctp_insn,
+ loop_cond_constant, loop_counter);
+ else if (INTVAL (cond_counter_iv.step) < 0)
+ return arm_mve_dlstp_check_dec_counter (loop, vctp_insn,
+ loop_cond_constant, loop_counter);
+ else
+ return NULL;
+}
+
+/* Predict whether the given loop in gimple will be transformed in the RTL
+ doloop_optimize pass. It could be argued that turning large enough loops
+ into low-overhead loops would not show a significant performance boost.
+ However, in the case of tail predication we would still avoid using VPT/VPST
+ instructions inside the loop, and in either case using low-overhead loops
+ would not be detrimental, so we decided to not consider size, avoiding the
+ need for a heuristic to determine what an appropriate size boundary is. */
+
+static bool
+arm_predict_doloop_p (struct loop *loop)
+{
+ gcc_assert (loop);
+ /* On arm, targetm.can_use_doloop_p is actually
+ can_use_doloop_if_innermost. Ensure the loop is innermost,
+ that it is valid as per arm_target_bb_ok_for_lob, and that the
+ correct architecture flags are enabled. */
+ if (!(TARGET_HAVE_LOB && optimize > 0))
+ {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "Predict doloop failure due to"
+ " target architecture or optimisation flags.\n");
+ return false;
+ }
+ else if (loop->inner != NULL)
+ {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "Predict doloop failure due to"
+ " loop nesting.\n");
+ return false;
+ }
+ else if (!arm_target_bb_ok_for_lob (loop->header->next_bb))
+ {
+ if (dump_file && (dump_flags & TDF_DETAILS))
+ fprintf (dump_file, "Predict doloop failure due to"
+ " loop bb complexity.\n");
+ return false;
+ }
+
+ return true;
+}
+
+/* Implement targetm.loop_unroll_adjust. Use this to block unrolling of loops
+ that may later be turned into MVE Tail Predicated Low Overhead Loops. The
+ performance benefit of an MVE LoL is likely to be much higher than that of
+ the unrolling. */
+
+unsigned
+arm_loop_unroll_adjust (unsigned nunroll, struct loop *loop)
+{
+ if (TARGET_HAVE_MVE
+ && arm_target_bb_ok_for_lob (loop->latch)
+ && arm_mve_loop_valid_for_dlstp (loop))
+ return 0;
+ else
+ return nunroll;
+}
+
+/* Function to handle emitting a VPT-unpredicated version of a VPT-predicated
+ insn to a sequence. */
+
+static bool
+arm_emit_mve_unpredicated_insn_to_seq (rtx_insn* insn)
+{
+ rtx insn_vpr_reg_operand = arm_get_required_vpr_reg_param (insn);
+ int new_icode = get_attr_mve_unpredicated_insn (insn);
+ if (!in_sequence_p ()
+ || !MVE_VPT_PREDICATED_INSN_P (insn)
+ || (!insn_vpr_reg_operand)
+ || (!new_icode))
+ return false;
+
+ extract_insn (insn);
+ rtx arr[8];
+ int j = 0;
+
+ /* When transforming a VPT-predicated instruction into its unpredicated
+ equivalent we need to drop the VPR operand and we may need to also drop a
+ merge "vuninit" input operand, depending on the instruction pattern. Here
+ ensure that we have at most a two-operand difference between the two
+ instructions. */
+ int n_operands_diff
+ = recog_data.n_operands - insn_data[new_icode].n_operands;
+ if (!(n_operands_diff > 0 && n_operands_diff <= 2))
+ return false;
+
+ rtx move = NULL_RTX;
+ /* Then, loop through the operands of the predicated
+ instruction, and retain the ones that map to the
+ unpredicated instruction. */
+ for (int i = 0; i < recog_data.n_operands; i++)
+ {
+ /* Ignore the VPR and, if needed, the vuninit
+ operand. */
+ if (insn_vpr_reg_operand == recog_data.operand[i])
+ continue;
+ if (n_operands_diff == 2
+ && !strcmp (recog_data.constraints[i], "0"))
+ {
+ move = gen_rtx_SET (arr[0], recog_data.operand[i]);
+ arr[0] = recog_data.operand[i];
+ }
+ else
+ arr[j++] = recog_data.operand[i];
+ }
+
+ /* Finally, emit the unpredicated instruction. */
+ rtx_insn *new_insn;
+ switch (j)
+ {
+ case 1:
+ new_insn = emit_insn (GEN_FCN (new_icode) (arr[0]));
+ break;
+ case 2:
+ new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1]));
+ break;
+ case 3:
+ new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2]));
+ break;
+ case 4:
+ new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+ arr[3]));
+ break;
+ case 5:
+ new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+ arr[3], arr[4]));
+ break;
+ case 6:
+ new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+ arr[3], arr[4], arr[5]));
+ break;
+ case 7:
+ new_insn = emit_insn (GEN_FCN (new_icode) (arr[0], arr[1], arr[2],
+ arr[3], arr[4], arr[5],
+ arr[6]));
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ INSN_LOCATION (new_insn) = INSN_LOCATION (insn);
+ if (move)
+ {
+ new_insn = emit_insn (move);
+ INSN_LOCATION (new_insn) = INSN_LOCATION (insn);
+ }
+ return true;
+}
+
+/* Return TRUE if INSN defines an MVE vector operand that has zeroed
+ tail-predicated lanes. This is true if either:
+ * INSN is predicated by VCTP_VPR_GENERATED and the 'invalid lanes' operand
+ is in the PROPS_ZERO_SET,
+ * all MVE vector operands are in the PROPS_ZERO_SET
+*/
+
+static bool
+arm_mve_propagate_zero_pred_p (vec <rtx_insn *> *props_zero_set,
+ rtx_insn *insn, rtx vctp_vpr_generated)
+{
+ if (arm_mve_load_store_insn_p (insn, DL_USAGE_READ))
+ return true;
+ if (arm_mve_load_store_insn_p (insn, DL_USAGE_WRITE))
+ return false;
+
+ int inactive_idx = -1;
+
+ extract_insn (insn);
+ /* If INSN is predicated by VCTP_VPR_GENERATED, then all tail-predicated
+ lanes will keep the value that is in the 'invalid lanes' register which we
+ identify by the "0" constraint, to ensure it is the same as the 'result'
+ register of this instruction. */
+ if (arm_mve_insn_predicated_by (insn, vctp_vpr_generated))
+ {
+ for (int i = 0; i < recog_data.n_operands; i++)
+ {
+ if (strcmp (recog_data.constraints[i], "0") == 0
+ && VALID_MVE_MODE (GET_MODE (recog_data.operand[i])))
+ {
+ inactive_idx = i;
+ break;
+ }
+ }
+ }
+
+ if (inactive_idx > 0)
+ {
+ rtx op = recog_data.operand[inactive_idx];
+ rtx_insn *def_insn = arm_last_vect_def_insn (op, insn);
+ return def_insn != NULL_RTX && props_zero_set->contains (def_insn);
+ }
+
+ /* If this instruction is not predicated by VCTP_VPR_GENERATED, then we must
+ check that all vector operands have zeroed tail-predicated lanes, and that
+ it has at least one vector operand. */
+ bool at_least_one_vector = false;
+ df_ref insn_uses;
+ FOR_EACH_INSN_USE (insn_uses, insn)
+ {
+ rtx reg = DF_REF_REG (insn_uses);
+ if (!VALID_MVE_MODE (GET_MODE (reg)))
+ continue;
+
+ rtx_insn *def_insn = arm_last_vect_def_insn (reg, insn);
+ if (def_insn && props_zero_set->contains (def_insn))
+ at_least_one_vector |= true;
+ else
+ return false;
+
+ }
+ return at_least_one_vector;
+}
+
+
+/* Attempt to transform the loop contents of the loop basic block from
+ VPT-predicated insns into unpredicated insns for a dlstp/letp loop. Returns
+ the number to decrement from the total number of elements each iteration.
+ Returns 1 if tail predication cannot be performed, falling back to scalar
+ low-overhead loops. */
+
+int
+arm_attempt_dlstp_transform (rtx label)
+{
+ if (!dlstp_enabled)
+ return 1;
+
+ basic_block body = single_succ (BLOCK_FOR_INSN (label));
+
+ /* Ensure that the bb is within a loop that has all required metadata. */
+ if (!body->loop_father || !body->loop_father->header
+ || !body->loop_father->simple_loop_desc)
+ return 1;
+
+ loop *loop = body->loop_father;
+ /* Instruction that sets the predicate mask depending on how many elements
+ are left to process. */
+ rtx_insn *vctp_insn = arm_mve_loop_valid_for_dlstp (loop);
+ if (!vctp_insn)
+ return 1;
+
+ gcc_assert (single_set (vctp_insn));
+
+ rtx vctp_vpr_generated = single_set (vctp_insn);
+ if (!vctp_vpr_generated)
+ return 1;
+
+ vctp_vpr_generated = SET_DEST (vctp_vpr_generated);
+
+ if (!vctp_vpr_generated || !REG_P (vctp_vpr_generated)
+ || !VALID_MVE_PRED_MODE (GET_MODE (vctp_vpr_generated)))
+ return 1;
+
+ /* decrementnum is already known to be valid at this point. */
+ int decrementnum = arm_mve_get_vctp_lanes (vctp_insn);
+
+ rtx_insn *insn = 0;
+ rtx_insn *cur_insn = 0;
+ rtx_insn *seq;
+ auto_vec <rtx_insn *> props_zero_set;
+
+ /* Scan through the insns in the loop bb and emit the transformed bb
+ insns to a sequence. */
+ start_sequence ();
+ FOR_BB_INSNS (body, insn)
+ {
+ if (GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn))
+ continue;
+ else if (NOTE_P (insn))
+ emit_note ((enum insn_note)NOTE_KIND (insn));
+ else if (DEBUG_INSN_P (insn))
+ emit_debug_insn (PATTERN (insn));
+ else if (!INSN_P (insn))
+ {
+ end_sequence ();
+ return 1;
+ }
+ /* If the transformation is successful we no longer need the vctp
+ instruction. */
+ else if (insn == vctp_insn)
+ continue;
+ /* If the insn pattern requires the use of the VPR value from the
+ vctp as an input parameter for predication. */
+ else if (arm_mve_insn_predicated_by (insn, vctp_vpr_generated))
+ {
+ /* Check whether this INSN propagates the zeroed tail-predication
+ lanes. */
+ if (arm_mve_propagate_zero_pred_p (&props_zero_set, insn,
+ vctp_vpr_generated))
+ props_zero_set.safe_push (insn);
+ bool success = arm_emit_mve_unpredicated_insn_to_seq (insn);
+ if (!success)
+ {
+ end_sequence ();
+ return 1;
+ }
+ }
+ /* If the insn isn't VPT predicated on vctp_vpr_generated, we need to
+ make sure that it is still valid within the dlstp/letp loop. */
+ else
+ {
+ /* If this instruction USE-s the vctp_vpr_generated other than for
+ predication, this blocks the transformation as we are not allowed
+ to optimise the VPR value away. */
+ df_ref insn_uses = NULL;
+ FOR_EACH_INSN_USE (insn_uses, insn)
+ {
+ if (rtx_equal_p (vctp_vpr_generated, DF_REF_REG (insn_uses)))
+ {
+ end_sequence ();
+ return 1;
+ }
+ }
+ /* If within the loop we have an MVE vector instruction that is
+ unpredicated, the dlstp/letp looping will add implicit
+ predication to it. This will result in a change in behaviour
+ of the instruction, so we need to find out if any instructions
+ that feed into the current instruction were implicitly
+ predicated. */
+ if (MVE_VPT_PREDICABLE_INSN_P (insn)
+ && !arm_mve_impl_predicated_p (&props_zero_set, insn,
+ vctp_vpr_generated))
+ {
+ end_sequence ();
+ return 1;
+ }
+ emit_insn (PATTERN (insn));
+ }
+ }
+ seq = get_insns ();
+ end_sequence ();
+
+ /* Re-write the entire BB contents with the transformed
+ sequence. */
+ FOR_BB_INSNS_SAFE (body, insn, cur_insn)
+ if (!(GET_CODE (insn) == CODE_LABEL || NOTE_INSN_BASIC_BLOCK_P (insn)))
+ delete_insn (insn);
+
+ emit_insn_after (seq, BB_END (body));
+
+ /* The transformation has succeeded, so now modify the "count"
+ (a.k.a. niter_expr) for the middle-end. Also set noloop_assumptions
+ to NULL to stop the middle-end from making assumptions about the
+ number of iterations. */
+ simple_loop_desc (body->loop_father)->niter_expr
+ = XVECEXP (SET_SRC (PATTERN (vctp_insn)), 0, 0);
+ simple_loop_desc (body->loop_father)->noloop_assumptions = NULL_RTX;
+ return decrementnum;
}
#if CHECKING_P
diff --git a/gcc/config/arm/arm.opt b/gcc/config/arm/arm.opt
index 0cd3fc2cd0c..d88c7a52e75 100644
--- a/gcc/config/arm/arm.opt
+++ b/gcc/config/arm/arm.opt
@@ -363,5 +363,8 @@ Target Joined RejectNegative String Var(arm_stack_protector_guard_offset_str)
Use an immediate to offset from the TLS register. This option is for use with
fstack-protector-guard=tls and not for use in user-land code.
+mdlstp
+Target Var(dlstp_enabled) Init(1) Undocumented
+
TargetVariable
long arm_stack_protector_guard_offset = 0
diff --git a/gcc/config/arm/iterators.md b/gcc/config/arm/iterators.md
index 8d066fcf05d..987602da1bf 100644
--- a/gcc/config/arm/iterators.md
+++ b/gcc/config/arm/iterators.md
@@ -2686,6 +2686,17 @@
(define_int_attr mrrc [(VUNSPEC_MRRC "mrrc") (VUNSPEC_MRRC2 "mrrc2")])
(define_int_attr MRRC [(VUNSPEC_MRRC "MRRC") (VUNSPEC_MRRC2 "MRRC2")])
+(define_int_attr dlstp_elemsize [(DLSTP8 "8") (DLSTP16 "16") (DLSTP32 "32")
+ (DLSTP64 "64")])
+
+(define_int_attr letp_num_lanes [(LETP8 "16") (LETP16 "8") (LETP32 "4")
+ (LETP64 "2")])
+(define_int_attr letp_num_lanes_neg [(LETP8 "-16") (LETP16 "-8") (LETP32 "-4")
+ (LETP64 "-2")])
+
+(define_int_attr letp_num_lanes_minus_1 [(LETP8 "15") (LETP16 "7") (LETP32 "3")
+ (LETP64 "1")])
+
(define_int_attr opsuffix [(UNSPEC_DOT_S "s8")
(UNSPEC_DOT_U "u8")
(UNSPEC_DOT_US "s8")
@@ -2926,6 +2937,10 @@
(define_int_iterator VSHLCQ_M [VSHLCQ_M_S VSHLCQ_M_U])
(define_int_iterator VQSHLUQ_M_N [VQSHLUQ_M_N_S])
(define_int_iterator VQSHLUQ_N [VQSHLUQ_N_S])
+(define_int_iterator DLSTP [DLSTP8 DLSTP16 DLSTP32
+ DLSTP64])
+(define_int_iterator LETP [LETP8 LETP16 LETP32
+ LETP64])
;; Define iterators for VCMLA operations
(define_int_iterator VCMLA_OP [UNSPEC_VCMLA
diff --git a/gcc/config/arm/mve.md b/gcc/config/arm/mve.md
index 9fe51298cdc..4b4d6298ffb 100644
--- a/gcc/config/arm/mve.md
+++ b/gcc/config/arm/mve.md
@@ -6930,3 +6930,53 @@
}
}
)
+
+;; Originally expanded by 'predicated_doloop_end'.
+;; In the rare situation where the branch is too far, we also need to
+;; revert FPSCR.LTPSIZE back to 0x100 after the last iteration.
+(define_insn "predicated_doloop_end_internal<letp_num_lanes>"
+ [(set (pc)
+ (if_then_else
+ (gtu (plus:SI (reg:SI LR_REGNUM)
+ (const_int <letp_num_lanes_neg>))
+ (const_int <letp_num_lanes_minus_1>))
+ (match_operand 0 "" "")
+ (pc)))
+ (set (reg:SI LR_REGNUM)
+ (plus:SI (reg:SI LR_REGNUM) (const_int <letp_num_lanes_neg>)))
+ ;; We use UNSPEC here to guarantee this pattern cannot be
+ ;; generated by an RTL optimization and be matched by other
+ ;; patterns, since this pattern is also responsible for turning off
+ ;; the tail predication machinery if we were to exit the loop.
+ ;; This is done by either the LETP or the LCTP instructions that
+ ;; this pattern generates.
+ (use (unspec:SI [(const_int 0)] LETP))
+ (clobber (reg:CC CC_REGNUM))]
+ "TARGET_HAVE_MVE"
+ {
+ if (get_attr_length (insn) == 4)
+ return "letp\t%|lr, %l0";
+ else
+ return "subs\t%|lr, #<letp_num_lanes>\n\tbhi\t%l0\n\tlctp";
+ }
+ [(set (attr "length")
+ (if_then_else
+ (ltu (minus (pc) (match_dup 0)) (const_int 1024))
+ (const_int 4)
+ (const_int 12)))
+ (set_attr "type" "branch")
+ (set_attr "conds" "unconditional")])
+
+(define_insn "dlstp<dlstp_elemsize>_insn"
+ [
+ (set (reg:SI LR_REGNUM)
+;; Similar to the previous pattern, we use UNSPEC here to make sure this
+;; rtx construct is not matched by other patterns, as this pattern is also
+;; responsible for setting the element size of the tail predication machinery
+;; using the dlstp.<size> instruction.
+ (unspec_volatile:SI [(match_operand:SI 0 "s_register_operand" "r")]
+ DLSTP))
+ ]
+ "TARGET_HAVE_MVE"
+ "dlstp.<dlstp_elemsize>\t%|lr, %0"
+ [(set_attr "type" "mve_misc")])
diff --git a/gcc/config/arm/thumb2.md b/gcc/config/arm/thumb2.md
index 84c9c3dfe80..66b3ae6040c 100644
--- a/gcc/config/arm/thumb2.md
+++ b/gcc/config/arm/thumb2.md
@@ -1613,7 +1613,7 @@
(use (match_operand 1 "" ""))] ; label
"TARGET_32BIT"
"
- {
+{
/* Currently SMS relies on the do-loop pattern to recognize loops
where (1) the control part consists of all insns defining and/or
using a certain 'count' register and (2) the loop count can be
@@ -1623,41 +1623,75 @@
Also used to implement the low over head loops feature, which is part of
the Armv8.1-M Mainline Low Overhead Branch (LOB) extension. */
- if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB))
- {
- rtx s0;
- rtx bcomp;
- rtx loc_ref;
- rtx cc_reg;
- rtx insn;
- rtx cmp;
-
- if (GET_MODE (operands[0]) != SImode)
- FAIL;
-
- s0 = operands [0];
-
- /* Low over head loop instructions require the first operand to be LR. */
- if (TARGET_HAVE_LOB && arm_target_insn_ok_for_lob (operands [1]))
- s0 = gen_rtx_REG (SImode, LR_REGNUM);
-
- if (TARGET_THUMB2)
- insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0, GEN_INT (-1)));
- else
- insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1)));
-
- cmp = XVECEXP (PATTERN (insn), 0, 0);
- cc_reg = SET_DEST (cmp);
- bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
- loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands [1]);
- emit_jump_insn (gen_rtx_SET (pc_rtx,
- gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
- loc_ref, pc_rtx)));
- DONE;
- }
- else
- FAIL;
- }")
+ if (optimize > 0 && (flag_modulo_sched || TARGET_HAVE_LOB))
+ {
+ rtx s0;
+ rtx bcomp;
+ rtx loc_ref;
+ rtx cc_reg;
+ rtx insn;
+ rtx cmp;
+ int decrement_num;
+
+ if (GET_MODE (operands[0]) != SImode)
+ FAIL;
+
+ s0 = operands[0];
+
+ if (TARGET_HAVE_LOB
+ && arm_target_bb_ok_for_lob (BLOCK_FOR_INSN (operands[1])))
+ {
+ /* If we have a compatible MVE target, try and analyse the loop
+ contents to determine if we can use predicated dlstp/letp
+ looping. These patterns implicitly use LR as the loop counter. */
+ if (TARGET_HAVE_MVE
+ && ((decrement_num = arm_attempt_dlstp_transform (operands[1]))
+ != 1))
+ {
+ loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]);
+ switch (decrement_num)
+ {
+ case 2:
+ insn = gen_predicated_doloop_end_internal2 (loc_ref);
+ break;
+ case 4:
+ insn = gen_predicated_doloop_end_internal4 (loc_ref);
+ break;
+ case 8:
+ insn = gen_predicated_doloop_end_internal8 (loc_ref);
+ break;
+ case 16:
+ insn = gen_predicated_doloop_end_internal16 (loc_ref);
+ break;
+ default:
+ gcc_unreachable ();
+ }
+ emit_jump_insn (insn);
+ DONE;
+ }
+ /* Remaining LOB cases need to explicitly use LR. */
+ s0 = gen_rtx_REG (SImode, LR_REGNUM);
+ }
+
+ /* Otherwise, try standard decrement-by-one dls/le looping. */
+ if (TARGET_THUMB2)
+ insn = emit_insn (gen_thumb2_addsi3_compare0 (s0, s0,
+ GEN_INT (-1)));
+ else
+ insn = emit_insn (gen_addsi3_compare0 (s0, s0, GEN_INT (-1)));
+
+ cmp = XVECEXP (PATTERN (insn), 0, 0);
+ cc_reg = SET_DEST (cmp);
+ bcomp = gen_rtx_NE (VOIDmode, cc_reg, const0_rtx);
+ loc_ref = gen_rtx_LABEL_REF (VOIDmode, operands[1]);
+ emit_jump_insn (gen_rtx_SET (pc_rtx,
+ gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
+ loc_ref, pc_rtx)));
+ DONE;
+ }
+ else
+ FAIL;
+}")
(define_insn "*clear_apsr"
[(unspec_volatile:SI [(const_int 0)] VUNSPEC_CLRM_APSR)
@@ -1755,7 +1789,37 @@
{
if (REGNO (operands[0]) == LR_REGNUM)
{
- emit_insn (gen_dls_insn (operands[0]));
+ /* Pick out the number by which we are decrementing the loop counter
+ in every iteration. If it's > 1, then use dlstp. */
+ int const_int_dec_num
+ = abs (INTVAL (XEXP (XEXP (XVECEXP (PATTERN (operands[1]), 0, 1),
+ 1),
+ 1)));
+ switch (const_int_dec_num)
+ {
+ case 16:
+ emit_insn (gen_dlstp8_insn (operands[0]));
+ break;
+
+ case 8:
+ emit_insn (gen_dlstp16_insn (operands[0]));
+ break;
+
+ case 4:
+ emit_insn (gen_dlstp32_insn (operands[0]));
+ break;
+
+ case 2:
+ emit_insn (gen_dlstp64_insn (operands[0]));
+ break;
+
+ case 1:
+ emit_insn (gen_dls_insn (operands[0]));
+ break;
+
+ default:
+ gcc_unreachable ();
+ }
DONE;
}
else
diff --git a/gcc/config/arm/types.md b/gcc/config/arm/types.md
index e2b70da1001..9527bdb9e87 100644
--- a/gcc/config/arm/types.md
+++ b/gcc/config/arm/types.md
@@ -574,6 +574,7 @@
; mve_move
; mve_store
; mve_load
+; mve_misc
(define_attr "type"
"adc_imm,\
@@ -1126,7 +1127,8 @@
ls64,\
mve_move,\
mve_store,\
- mve_load"
+ mve_load, \
+ mve_misc"
(cond [(eq_attr "autodetect_type" "alu_shift_lsr_op2,alu_shift_asr_op2")
(const_string "alu_shift_imm_other")
(eq_attr "autodetect_type" "alu_shift_lsl_op2")
@@ -1292,7 +1294,7 @@
;; No otherwise.
(define_attr "is_mve_type" "yes,no"
(if_then_else (eq_attr "type"
- "mve_move, mve_load, mve_store, mrs")
+ "mve_move, mve_load, mve_store, mrs, mve_misc")
(const_string "yes")
(const_string "no")))
diff --git a/gcc/config/arm/unspecs.md b/gcc/config/arm/unspecs.md
index 46ac8b37157..f5f4d154364 100644
--- a/gcc/config/arm/unspecs.md
+++ b/gcc/config/arm/unspecs.md
@@ -591,6 +591,10 @@
VADDLVQ_U
VCTP
VCTP_M
+ LETP8
+ LETP16
+ LETP32
+ LETP64
VPNOT
VCREATEQ_F
VCVTQ_N_TO_F_S
@@ -1259,6 +1263,14 @@
UQRSHLL_48
SQRSHRL_64
SQRSHRL_48
- VSHLCQ_M_
REINTERPRET
])
+
+; DLSTP unspecs must be volatile to guarantee the scheduler does not reschedule
+; these instructions within the loop preheader.
+(define_c_enum "unspecv" [
+ DLSTP8
+ DLSTP16
+ DLSTP32
+ DLSTP64
+])
diff --git a/gcc/testsuite/gcc.target/arm/lob.h b/gcc/testsuite/gcc.target/arm/lob.h
index feaae7cc899..3941fe7a8b6 100644
--- a/gcc/testsuite/gcc.target/arm/lob.h
+++ b/gcc/testsuite/gcc.target/arm/lob.h
@@ -1,15 +1,131 @@
#include <string.h>
-
+#include <stdint.h>
/* Common code for lob tests. */
#define NO_LOB asm volatile ("@ clobber lr" : : : "lr" )
-#define N 10000
+#define N 100
+
+static void
+reset_data (int *a, int *b, int *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (b, -1, x * sizeof (*b));
+ memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data8 (int8_t *a, int8_t *b, int8_t *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (b, -1, x * sizeof (*b));
+ memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data16 (int16_t *a, int16_t *b, int16_t *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (b, -1, x * sizeof (*b));
+ memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data32 (int32_t *a, int32_t *b, int32_t *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (b, -1, x * sizeof (*b));
+ memset (c, 0, x * sizeof (*c));
+}
+
+static void
+reset_data64 (int64_t *a, int64_t *c, int x)
+{
+ memset (a, -1, x * sizeof (*a));
+ memset (c, 0, x * sizeof (*c));
+}
+
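+/* Check that the first x elements of c hold a[i] + b[i] and that the
+   remaining elements are still zero from reset_data.  */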
+static void
+check_plus (int *a, int *b, int *c, int x)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != (a[i] + b[i])) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
+}
+
+static void
+check_plus8 (int8_t *a, int8_t *b, int8_t *c, int x)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != (a[i] + b[i])) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
+}
+
+static void
+check_plus16 (int16_t *a, int16_t *b, int16_t *c, int x)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != (a[i] + b[i])) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
+}
+
+static void
+check_plus32 (int32_t *a, int32_t *b, int32_t *c, int x)
+{
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != (a[i] + b[i])) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
+}
static void
-reset_data (int *a, int *b, int *c)
+check_memcpy64 (int64_t *a, int64_t *c, int x)
{
- memset (a, -1, N * sizeof (*a));
- memset (b, -1, N * sizeof (*b));
- memset (c, -1, N * sizeof (*c));
+ for (int i = 0; i < N; i++)
+ {
+ NO_LOB;
+ if (i < x)
+ {
+ if (c[i] != a[i]) abort ();
+ }
+ else
+ {
+ if (c[i] != 0) abort ();
+ }
+ }
}
diff --git a/gcc/testsuite/gcc.target/arm/lob1.c b/gcc/testsuite/gcc.target/arm/lob1.c
index ba5c82cd55c..c8ce653a5c3 100644
--- a/gcc/testsuite/gcc.target/arm/lob1.c
+++ b/gcc/testsuite/gcc.target/arm/lob1.c
@@ -54,29 +54,18 @@ loop3 (int *a, int *b, int *c)
} while (i < N);
}
-void
-check (int *a, int *b, int *c)
-{
- for (int i = 0; i < N; i++)
- {
- NO_LOB;
- if (c[i] != a[i] + b[i])
- abort ();
- }
-}
-
int
main (void)
{
- reset_data (a, b, c);
+ reset_data (a, b, c, N);
loop1 (a, b ,c);
- check (a, b ,c);
- reset_data (a, b, c);
+ check_plus (a, b, c, N);
+ reset_data (a, b, c, N);
loop2 (a, b ,c);
- check (a, b ,c);
- reset_data (a, b, c);
+ check_plus (a, b, c, N);
+ reset_data (a, b, c, N);
loop3 (a, b ,c);
- check (a, b ,c);
+ check_plus (a, b, c, N);
return 0;
}
diff --git a/gcc/testsuite/gcc.target/arm/lob6.c b/gcc/testsuite/gcc.target/arm/lob6.c
index 17b6124295e..4fe116e2c2b 100644
--- a/gcc/testsuite/gcc.target/arm/lob6.c
+++ b/gcc/testsuite/gcc.target/arm/lob6.c
@@ -79,14 +79,14 @@ check (void)
int
main (void)
{
- reset_data (a1, b1, c1);
- reset_data (a2, b2, c2);
+ reset_data (a1, b1, c1, N);
+ reset_data (a2, b2, c2, N);
loop1 (a1, b1, c1);
ref1 (a2, b2, c2);
check ();
- reset_data (a1, b1, c1);
- reset_data (a2, b2, c2);
+ reset_data (a1, b1, c1, N);
+ reset_data (a2, b2, c2, N);
loop2 (a1, b1, c1);
ref2 (a2, b2, c2);
check ();
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c
new file mode 100644
index 00000000000..6e6da3d3d59
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-1.c
@@ -0,0 +1,146 @@
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O3 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include <arm_mve.h>
+
+#define IMM 5
+
+#define TEST_COMPILE_IN_DLSTP_TERNARY(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \
+void test_##NAME##PRED##_##SIGN##BITS (TYPE##BITS##_t *a, TYPE##BITS##_t *b, TYPE##BITS##_t *c, int n) \
+{ \
+ while (n > 0) \
+ { \
+ mve_pred16_t p = vctp##BITS##q (n); \
+ TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \
+ TYPE##BITS##x##LANES##_t vb = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (b, p); \
+ TYPE##BITS##x##LANES##_t vc = NAME##PRED##_##SIGN##BITS (va, vb, p); \
+ vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \
+ c += LANES; \
+ a += LANES; \
+ b += LANES; \
+ n -= LANES; \
+ } \
+}
+
+#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY(BITS, LANES, LDRSTRYTPE, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED)
+
+#define TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY(NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY (8, 16, b, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY (16, 8, h, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY (32, 4, w, NAME, PRED)
+
+
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vaddq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vmulq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vsubq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vhaddq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY (vorrq, _x)
+
+
+#define TEST_COMPILE_IN_DLSTP_TERNARY_M(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \
+void test_##NAME##PRED##_##SIGN##BITS (TYPE##BITS##x##LANES##_t __inactive, TYPE##BITS##_t *a, TYPE##BITS##_t *b, TYPE##BITS##_t *c, int n) \
+{ \
+ while (n > 0) \
+ { \
+ mve_pred16_t p = vctp##BITS##q (n); \
+ TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \
+ TYPE##BITS##x##LANES##_t vb = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (b, p); \
+ TYPE##BITS##x##LANES##_t vc = NAME##PRED##_##SIGN##BITS (__inactive, va, vb, p); \
+ vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \
+ c += LANES; \
+ a += LANES; \
+ b += LANES; \
+ n -= LANES; \
+ } \
+}
+
+#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M(BITS, LANES, LDRSTRYTPE, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY_M (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY_M (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED)
+
+#define TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M(NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M (8, 16, b, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M (16, 8, h, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M (32, 4, w, NAME, PRED)
+
+
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vaddq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vmulq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vsubq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vhaddq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M (vorrq, _m)
+
+#define TEST_COMPILE_IN_DLSTP_TERNARY_N(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \
+void test_##NAME##PRED##_n_##SIGN##BITS (TYPE##BITS##_t *a, TYPE##BITS##_t *c, int n) \
+{ \
+ while (n > 0) \
+ { \
+ mve_pred16_t p = vctp##BITS##q (n); \
+ TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \
+ TYPE##BITS##x##LANES##_t vc = NAME##PRED##_n_##SIGN##BITS (va, IMM, p); \
+ vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \
+ c += LANES; \
+ a += LANES; \
+ n -= LANES; \
+ } \
+}
+
+#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N(BITS, LANES, LDRSTRYTPE, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY_N (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY_N (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED)
+
+#define TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N(NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N (8, 16, b, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N (16, 8, h, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_N (32, 4, w, NAME, PRED)
+
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vaddq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vmulq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vsubq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vhaddq, _x)
+
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vbrsrq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vshlq, _x)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_N (vshrq, _x)
+
+#define TEST_COMPILE_IN_DLSTP_TERNARY_M_N(BITS, LANES, LDRSTRYTPE, TYPE, SIGN, NAME, PRED) \
+void test_##NAME##PRED##_n_##SIGN##BITS (TYPE##BITS##x##LANES##_t __inactive, TYPE##BITS##_t *a, TYPE##BITS##_t *c, int n) \
+{ \
+ while (n > 0) \
+ { \
+ mve_pred16_t p = vctp##BITS##q (n); \
+ TYPE##BITS##x##LANES##_t va = vldr##LDRSTRYTPE##q_z_##SIGN##BITS (a, p); \
+ TYPE##BITS##x##LANES##_t vc = NAME##PRED##_n_##SIGN##BITS (__inactive, va, IMM, p); \
+ vstr##LDRSTRYTPE##q_p_##SIGN##BITS (c, vc, p); \
+ c += LANES; \
+ a += LANES; \
+ n -= LANES; \
+ } \
+}
+
+#define TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N(BITS, LANES, LDRSTRYTPE, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY_M_N (BITS, LANES, LDRSTRYTPE, int, s, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_TERNARY_M_N (BITS, LANES, LDRSTRYTPE, uint, u, NAME, PRED)
+
+#define TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N(NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N (8, 16, b, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N (16, 8, h, NAME, PRED) \
+TEST_COMPILE_IN_DLSTP_SIGNED_UNSIGNED_TERNARY_M_N (32, 4, w, NAME, PRED)
+
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vaddq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vmulq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vsubq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vhaddq, _m)
+
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vbrsrq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vshlq, _m)
+TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY_M_N (vshrq, _m)
+
+/* The total number of dlstp loops is the number of
+  `TEST_COMPILE_IN_DLSTP_INTBITS_SIGNED_UNSIGNED_TERNARY.*` macro invocations
+  multiplied by 6 (three element sizes, each signed and unsigned): 24 * 6 = 144. */
+/* { dg-final { scan-assembler-times {\tdlstp} 144 } } */
+/* { dg-final { scan-assembler-times {\tletp} 144 } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c
new file mode 100644
index 00000000000..84f4a2fc4f9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-2.c
@@ -0,0 +1,749 @@
+
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O3 -save-temps -fno-schedule-insns2 " } */
+/* { dg-add-options arm_v8_1m_mve } */
+/* { dg-additional-options "-mtune=cortex-m55" } */
+/* { dg-final { check-function-bodies "**" "" "" } } */
+
+#include <arm_mve.h>
+/* Using a >=1 condition. */
+void test1 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n >= 1)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c+=4;
+ a+=4;
+ b+=4;
+ n-=4;
+ }
+}
+/*
+** test1:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vldrw.32 q[0-9]+, \[r1\], #16
+** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Test a for-loop format that decrements down to zero. */
+int32_t a[] = {0, 1, 2, 3, 4, 5, 6, 7};
+void test2 (int32_t *b, int num_elems)
+{
+ for (int i = num_elems; i > 0; i-= 4)
+ {
+ mve_pred16_t p = vctp32q (i);
+ int32x4_t va = vldrwq_z_s32 (&(a[i]), p);
+ vstrwq_p_s32 (b + i, va, p);
+ }
+}
+/*
+** test2:
+**...
+** dlstp.32 lr, r1
+**...
+** vldrw.32 (q[0-9]+), \[r3\], #-16
+** vstrw.32 \1, \[r0\], #-16
+** letp lr, .*
+**...
+*/
+
+/* Iteration counter counting up to num_iter. */
+void test3 (uint8_t *a, uint8_t *b, uint8_t *c, unsigned n)
+{
+ int num_iter = (n + 15)/16;
+ for (int i = 0; i < num_iter; i++)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n-=16;
+ a += 16;
+ b += 16;
+ c += 16;
+ }
+}
+
+/*
+** test3:
+**...
+** dlstp.8 lr, r3
+**...
+** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
+** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
+**...
+** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrb.8 \3, \[(r[0-9]+|ip)\]
+**...
+** letp lr, .*
+**...
+*/
+
+/* Iteration counter counting down from num_iter. */
+void test4 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ int num_iter = (n + 15)/16;
+ for (int i = num_iter; i > 0; i--)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n-=16;
+ a += 16;
+ b += 16;
+ c += 16;
+ }
+}
+/*
+** test4:
+**...
+** dlstp.8 lr, r3
+**...
+** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
+** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
+**...
+** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrb.8 \3, \[(r[0-9]+|ip)\]
+**...
+** letp lr, .*
+**...
+*/
+
+/* Using an unpredicated arithmetic instruction within the loop. */
+void test5 (uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *d, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_u8 (b);
+      /* vc is affected by implicit predication, because vb also
+        came from an unpredicated load, but there is no functional
+        problem, because the result is only used in a predicated store. */
+ uint8x16_t vc = vaddq_u8 (va, vb);
+ uint8x16_t vd = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ vstrbq_p_u8 (d, vd, p);
+ n-=16;
+ a += 16;
+ b += 16;
+ c += 16;
+ }
+}
+
+/*
+** test5:
+**...
+** dlstp.8 lr, r[0-9]+
+**...
+** vldrb.8 q[0-9]+, \[r1\]
+** vldrb.8 q[0-9]+, \[r2\]
+**...
+** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
+**...
+** vstrb.8 \1, \[r2\]
+** vstrb.8 \1, \[r3\]
+** letp lr, .*
+**...
+*/
+
+/* Using a different VPR value for one instruction in the loop. */
+void test6 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/*
+** test6:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vpst
+** vldrwt.32 q[0-9]+, \[r1\], #16
+** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Generating and using another VPR value in the loop, with a vctp.
+ The doloop logic will always try to do the transform on the first
+ vctp it encounters, so this is still expected to work. */
+void test7 (int32_t *a, int32_t *b, int32_t *c, int n, int g)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ mve_pred16_t p1 = vctp32q (g);
+ int32x4_t vb = vldrwq_z_s32 (b, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+/*
+** test7:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vpst
+** vldrwt.32 q[0-9]+, \[r1\], #16
+** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Generating and using a different VPR value in the loop, with a vctp,
+   but this time p1 will also change in every iteration (still fine). */
+void test8 (int32_t *a, int32_t *b, int32_t *c, int n, int g)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ mve_pred16_t p1 = vctp32q (g);
+ int32x4_t vb = vldrwq_z_s32 (b, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ g++;
+ }
+}
+
+/*
+** test8:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vctp.32 r4
+** vpst
+** vldrwt.32 q[0-9]+, \[r1\], #16
+**...
+** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Generating and using a different VPR value in the loop, with a vctp_m
+ that is independent of the loop vctp VPR. */
+void test9 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ mve_pred16_t p2 = vctp32q_m (n, p1);
+ int32x4_t vb = vldrwq_z_s32 (b, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p2);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/*
+** test9:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vmsr p0, (r[0-9]+) @ movhi
+** vpst
+** vctpt.32 r3
+** vmrs (r[0-9]+), p0 @ movhi
+** vmsr p0, \1 @ movhi
+** vpst
+** vldrwt.32 q[0-9]+, \[r1\], #16
+** vmsr p0, \2 @ movhi
+** vpst
+** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+**...
+** vstrw.32 \3, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Generating and using a different VPR value in the loop,
+ with a vctp_m that is tied to the base vctp VPR. This
+ is still fine, because the vctp_m will be transformed
+ into a vctp and be implicitly predicated. */
+void test10 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ mve_pred16_t p1 = vctp32q_m (n, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p1);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+/*
+   We don't need that extra vctp in the loop, but we currently do not optimize
+   it away; it is, however, not incorrect to use it.
+*/
+/*
+** test10:
+**...
+** dlstp.32 lr, r3
+** vctp.32 r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+**...
+** vpst
+** vldrwt.32 q[0-9]+, \[r1\], #16
+** vpst
+** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Generating and using a different VPR value in the loop, with a vcmp. */
+void test11 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ mve_pred16_t p1 = vcmpeqq_s32 (va, vb);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p1);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/*
+** test11:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vldrw.32 q[0-9]+, \[r1\], #16
+** vcmp.i32 eq, q[0-9]+, q[0-9]+
+** vpst
+** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Generating and using a different VPR value in the loop, with a vcmp_m. */
+void test12 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ mve_pred16_t p2 = vcmpeqq_m_s32 (va, vb, p1);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p2);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/*
+** test12:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vldrw.32 q[0-9]+, \[r1\], #16
+** vmsr p0, (r[0-9]+|ip) @ movhi
+** vpst
+** vcmpt.i32 eq, q[0-9]+, q[0-9]+
+** vpst
+** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \2, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Generating and using a different VPR value in the loop, with a vcmp_m
+ that is tied to the base vctp VPR (same as above, this will be turned
+ into a vcmp and be implicitly predicated). */
+void test13 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p1)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ mve_pred16_t p2 = vcmpeqq_m_s32 (va, vb, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p2);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/*
+** test13:
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vldrw.32 q[0-9]+, \[r1\], #16
+** vcmp.i32 eq, q[0-9]+, q[0-9]+
+** vpst
+** vaddt.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
+/* Similar to test27 in dlstp-invalid-asm.c, but use a predicated load to make
+ it safe to implicitly predicate the vaddv. */
+void test14 (int32_t *a, int32_t *c, int n)
+{
+ int32_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ res += vaddvq_s32 (va);
+ int32x4_t vc = vdupq_n_s32 (res);
+ vstrwq_p_s32 (c, vc, p);
+ a += 4;
+ n -= 4;
+ }
+}
+
+/*
+** test14:
+**...
+** dlstp.32 lr, r2
+** vldrw.32 (q[0-9]+), \[r0\], #16
+** vaddv.s32 (r[0-9]+|ip), \1
+** add (r[0-9]+|ip), \3, \2
+** vdup.32 (q[0-9]+), \3
+** vstrw.32 \4, \[r1\]
+** letp lr, .*
+**...
+*/
+
+uint8_t test15 (uint8_t *a, uint8_t *b, int n)
+{
+ uint8_t res = 0;
+ uint8x16_t vc = vdupq_n_u8 (0);
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_u8 (b);
+ vc = vaddq_m (vc, va, vc, p);
+ res = vgetq_lane (vc, 5);
+
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ return res;
+}
+
+/*
+** test15:
+**...
+** dlstp.8 lr, r2
+**...
+** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
+**...
+** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
+**...
+** letp lr, .*
+** vmov.u8 r[0-9]+, \2\[5\]
+**...
+*/
+
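+/* Like test15, but with an additional unpredicated vaddq feeding the
+   predicated accumulation; the lane extract still happens after the loop.  */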
+uint8_t test16 (uint8_t *a, uint8_t *b, int n)
+{
+ uint8_t res = 0;
+ uint8x16_t vc = vdupq_n_u8 (0);
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_u8 (b);
+ vc = vaddq (va, vc);
+ vc = vaddq_m (vc, va, vc, p);
+ res = vgetq_lane (vc, 5);
+
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ return res;
+}
+
+/*
+** test16:
+**...
+** dlstp.8 lr, r2
+**...
+** vldrb.8 q[0-9]+, \[(r[0-9]+|ip)\]
+**...
+** vadd.i8 (q[0-9]+), q[0-9]+, q[0-9]+
+** vadd.i8 \2, q[0-9]+, q[0-9]+
+** letp lr, .*
+** vmov.u8 r[0-9]+, \2\[5\]
+**...
+*/
+
+
+
+/* Using an across-vector unpredicated instruction in a valid way.
+ This tests that "vc" has correctly masked the risky "vb". */
+uint16_t test18 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
+{
+ uint16x8_t vb = vldrhq_u16 (b);
+ uint16_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ uint16x8_t va = vldrhq_z_u16 (a, p);
+ uint16x8_t vc = vaddq_m_u16 (va, va, vb, p);
+ res += vaddvq_u16 (vc);
+ c += 8;
+ a += 8;
+ b += 8;
+ n -= 8;
+ }
+ return res;
+}
+
+/*
+** test18:
+**...
+** dlstp.16 lr, r3
+** vldrh.16 (q[0-9]+), \[r2\], #16
+** vadd.i16 \1, q[0-9]+, q[0-9]+
+** vaddv.u16 (r[0-9]+|ip), \1
+** add (r[0-9]+|ip), \3, \2
+** uxth \3, \3
+** letp lr, .*
+**...
+*/
+
+/* Using an across-vector unpredicated instruction with implicit scalar adding from outside the loop. */
+uint16_t test19 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
+{
+ uint16x8_t vb = vldrhq_u16 (b);
+ uint16_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ uint16x8_t va = vldrhq_z_u16 (a, p);
+ uint16x8_t vc = vaddq_m_u16 (va, va, vb, p);
+ res = vaddvaq_u16 (res, vc);
+ c += 8;
+ a += 8;
+ b += 8;
+ n -= 8;
+ }
+ return res;
+}
+
+/*
+** test19:
+**...
+** dlstp.16 lr, r3
+** vldrh.16 (q[0-9]+), \[r2\], #16
+** vadd.i16 \1, q[0-9]+, q[0-9]+
+** vaddva.u16 (r[0-9]+|ip), \1
+** uxth \2, \2
+** letp lr, .*
+**...
+*/
+
+
+/* Using an across-vector predicated instruction in a valid way. */
+uint16_t test20 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
+{
+ uint16_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ uint16x8_t va = vldrhq_u16 (a);
+ res = vaddvaq_p_u16 (res, va, p);
+ c += 8;
+ a += 8;
+ b += 8;
+ n -= 8;
+ }
+ return res;
+}
+
+/* The uxth could be moved outside the loop. */
+/*
+** test20:
+**...
+** dlstp.16 lr, r3
+** vldrh.16 (q[0-9]+), \[r2\], #16
+** vaddva.u16 (r[0-9]+|ip), \1
+** uxth \2, \2
+** letp lr, .*
+**...
+*/
+
+/* Using an across-vector predicated instruction in a valid way. */
+uint16_t test21 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
+{
+ uint16_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ uint16x8_t va = vldrhq_u16 (a);
+ res++;
+ res = vaddvaq_p_u16 (res, va, p);
+ c += 8;
+ a += 8;
+ b += 8;
+ n -= 8;
+ }
+ return res;
+}
+
+/* It should also be safe to move the uxth outside of the loop here. */
+/*
+** test21:
+**...
+** dlstp.16 lr, r3
+** vldrh.16 (q[0-9]+), \[r2\], #16
+** adds (r[0-9]+|ip), \2, #1
+** uxth \2, \2
+** vaddva.u16 \2, \1
+** uxth \2, \2
+** letp lr, .*
+**...
+*/
+
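+/* Unpredicated vmaxvq of an unsigned zero-predicated load: the zeroed
+   inactive lanes cannot raise an unsigned maximum, so they do not change the
+   result.  */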
+int test22 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ int res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ res = vmaxvq (res, va);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
+/*
+** test22:
+**...
+** dlstp.8 lr, r3
+**...
+** vldrb.8 (q[0-9]+), \[r[0-9]+\]
+**...
+** vmaxv.u8 (r[0-9]+|ip), \1
+** uxtb \2, \2
+** letp lr, .*
+**...
+*/
+
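+/* Unpredicated vmaxavq (maximum absolute value) of a zero-predicated load:
+   the zeroed lanes have absolute value zero, so they cannot change the
+   result.  */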
+int test23 (int8_t *a, int8_t *b, int8_t *c, int n)
+{
+ int res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ int8x16_t va = vldrbq_z_s8 (a, p);
+ res = vmaxavq (res, va);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
+/*
+** test23:
+**...
+** dlstp.8 lr, r3
+**...
+** vldrb.8 (q[0-9]+), \[r3\]
+**...
+** vmaxav.s8 (r[0-9]+|ip), \1
+** uxtb \2, \2
+** letp lr, .*
+**...
+*/
+
+/* Like test1, but update n before vctp, meaning we should only iterate for n-4
+ elements. */
+void test24 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n >= 1)
+ {
+ n-=4;
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c+=4;
+ a+=4;
+ b+=4;
+ }
+}
+/*
+** test24:
+**...
+** subs r3, r3, #4
+**...
+** dlstp.32 lr, r3
+** vldrw.32 q[0-9]+, \[r0\], #16
+** vldrw.32 q[0-9]+, \[r1\], #16
+** vadd.i32 (q[0-9]+), q[0-9]+, q[0-9]+
+** vstrw.32 \1, \[r2\], #16
+** letp lr, .*
+**...
+*/
+
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c
new file mode 100644
index 00000000000..c784f540131
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-compile-asm-3.c
@@ -0,0 +1,46 @@
+
+/* { dg-do compile } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O3 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include <arm_mve.h>
+
+/* We don't support pattern recognition of signed N values when computing num_iter. */
+void test3 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ int num_iter = (n + 15)/16;
+ for (int i = 0; i < num_iter; i++)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n-=16;
+ a += 16;
+ b += 16;
+ c += 16;
+ }
+}
+
+/* Using a predicated vcmp to generate a new predicate value in the
+ loop and then using it in a predicated store insn. */
+void test17 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_s32 (va, vb);
+ mve_pred16_t p1 = vcmpeqq_m_s32 (va, vc, p);
+ vstrwq_p_s32 (c, vc, p1);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+/* This is an example of a loop that we could tail predicate but currently don't. */
+/* { dg-final { scan-assembler "letp" { xfail *-*-* } } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c
new file mode 100644
index 00000000000..6966a396604
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8-run.c
@@ -0,0 +1,44 @@
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-require-effective-target arm_mve_hw } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+#include "dlstp-int16x8.c"
+
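+/* Call the tail-predicated loop with element counts of zero, below, equal to
+   and above the number of lanes, and check both the computed prefix and the
+   untouched tail.  */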
+int main ()
+{
+ int i;
+ int16_t temp1[N];
+ int16_t temp2[N];
+ int16_t temp3[N];
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 0);
+ check_plus16 (temp1, temp2, temp3, 0);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 1);
+ check_plus16 (temp1, temp2, temp3, 1);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 7);
+ check_plus16 (temp1, temp2, temp3, 7);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 8);
+ check_plus16 (temp1, temp2, temp3, 8);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 9);
+ check_plus16 (temp1, temp2, temp3, 9);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 16);
+ check_plus16 (temp1, temp2, temp3, 16);
+
+ reset_data16 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 17);
+ check_plus16 (temp1, temp2, temp3, 17);
+
+ reset_data16 (temp1, temp2, temp3, N);
+}
+
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c
new file mode 100644
index 00000000000..33632c5f14d
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int16x8.c
@@ -0,0 +1,31 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "../lob.h"
+
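+/* The canonical candidate loop: a vctp16q predicate generated from the
+   element count guards the loads, the add and the store, and n is decremented
+   by the number of lanes each iteration.  */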
+void __attribute__ ((noinline)) test (int16_t *a, int16_t *b, int16_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ int16x8_t va = vldrhq_z_s16 (a, p);
+ int16x8_t vb = vldrhq_z_s16 (b, p);
+ int16x8_t vc = vaddq_x_s16 (va, vb, p);
+ vstrhq_p_s16 (c, vc, p);
+ c+=8;
+ a+=8;
+ b+=8;
+ n-=8;
+ }
+}
+
+/* { dg-final { scan-assembler-times {\tdlstp.16} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp" } } */
+/* { dg-final { scan-assembler-not "\tvpst" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4-run.c
new file mode 100644
index 00000000000..6833dddde92
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4-run.c
@@ -0,0 +1,45 @@
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-require-effective-target arm_mve_hw } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include "dlstp-int32x4.c"
+
+int main ()
+{
+ int i;
+ int32_t temp1[N];
+ int32_t temp2[N];
+ int32_t temp3[N];
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 0);
+ check_plus32 (temp1, temp2, temp3, 0);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 1);
+ check_plus32 (temp1, temp2, temp3, 1);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 3);
+ check_plus32 (temp1, temp2, temp3, 3);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 4);
+ check_plus32 (temp1, temp2, temp3, 4);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 5);
+ check_plus32 (temp1, temp2, temp3, 5);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 8);
+ check_plus32 (temp1, temp2, temp3, 8);
+
+ reset_data32 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 9);
+ check_plus32 (temp1, temp2, temp3, 9);
+
+ reset_data32 (temp1, temp2, temp3, N);
+}
+
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4.c
new file mode 100644
index 00000000000..5d09f784b77
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int32x4.c
@@ -0,0 +1,31 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "../lob.h"
+
+void __attribute__ ((noinline)) test (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c+=4;
+ a+=4;
+ b+=4;
+ n-=4;
+ }
+}
+
+/* { dg-final { scan-assembler-times {\tdlstp.32} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp" } } */
+/* { dg-final { scan-assembler-not "\tvpst" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2-run.c
new file mode 100644
index 00000000000..cc0b9ce7ee9
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2-run.c
@@ -0,0 +1,48 @@
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-require-effective-target arm_mve_hw } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include "dlstp-int64x2.c"
+
+int main ()
+{
+ int i;
+ int64_t temp1[N];
+ int64_t temp3[N];
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 0);
+ check_memcpy64 (temp1, temp3, 0);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 1);
+ check_memcpy64 (temp1, temp3, 1);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 2);
+ check_memcpy64 (temp1, temp3, 2);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 3);
+ check_memcpy64 (temp1, temp3, 3);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 4);
+ check_memcpy64 (temp1, temp3, 4);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 5);
+ check_memcpy64 (temp1, temp3, 5);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 6);
+ check_memcpy64 (temp1, temp3, 6);
+
+ reset_data64 (temp1, temp3, N);
+ test (temp1, temp3, 7);
+ check_memcpy64 (temp1, temp3, 7);
+
+ reset_data64 (temp1, temp3, N);
+}
+
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2.c
new file mode 100644
index 00000000000..21e882424ec
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int64x2.c
@@ -0,0 +1,28 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "../lob.h"
+
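+/* A two-lane tail-predicated copy using gather loads and scatter stores with
+   a {0, 8} byte offset vector.  */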
+void __attribute__ ((noinline)) test (int64_t *a, int64_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp64q (n);
+ int64x2_t va = vldrdq_gather_offset_z_s64 (a, vcreateq_u64 (0, 8), p);
+ vstrdq_scatter_offset_p_s64 (c, vcreateq_u64 (0, 8), va, p);
+ c+=2;
+ a+=2;
+ n-=2;
+ }
+}
+
+/* { dg-final { scan-assembler-times {\tdlstp.64} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp" } } */
+/* { dg-final { scan-assembler-not "\tvpst" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16-run.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16-run.c
new file mode 100644
index 00000000000..d46571f329c
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16-run.c
@@ -0,0 +1,44 @@
+/* { dg-do run { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-require-effective-target arm_mve_hw } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include "dlstp-int8x16.c"
+
+int main ()
+{
+ int i;
+ int8_t temp1[N];
+ int8_t temp2[N];
+ int8_t temp3[N];
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 0);
+ check_plus8 (temp1, temp2, temp3, 0);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 1);
+ check_plus8 (temp1, temp2, temp3, 1);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 15);
+ check_plus8 (temp1, temp2, temp3, 15);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 16);
+ check_plus8 (temp1, temp2, temp3, 16);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 17);
+ check_plus8 (temp1, temp2, temp3, 17);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 32);
+ check_plus8 (temp1, temp2, temp3, 32);
+
+ reset_data8 (temp1, temp2, temp3, N);
+ test (temp1, temp2, temp3, 33);
+ check_plus8 (temp1, temp2, temp3, 33);
+
+ reset_data8 (temp1, temp2, temp3, N);
+}
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16.c
new file mode 100644
index 00000000000..d5f22b50262
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-int8x16.c
@@ -0,0 +1,32 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O2 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include <arm_mve.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include "../lob.h"
+
+void __attribute__ ((noinline)) test (int8_t *a, int8_t *b, int8_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ int8x16_t va = vldrbq_z_s8 (a, p);
+ int8x16_t vb = vldrbq_z_s8 (b, p);
+ int8x16_t vc = vaddq_x_s8 (va, vb, p);
+ vstrbq_p_s8 (c, vc, p);
+ c+=16;
+ a+=16;
+ b+=16;
+ n-=16;
+ }
+}
+
+
+/* { dg-final { scan-assembler-times {\tdlstp.8} 1 } } */
+/* { dg-final { scan-assembler-times {\tletp} 1 } } */
+/* { dg-final { scan-assembler-not "\tvctp" } } */
+/* { dg-final { scan-assembler-not "\tvpst" } } */
+/* { dg-final { scan-assembler-not "p0" } } */
diff --git a/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c b/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c
new file mode 100644
index 00000000000..26df2d30523
--- /dev/null
+++ b/gcc/testsuite/gcc.target/arm/mve/dlstp-invalid-asm.c
@@ -0,0 +1,521 @@
+/* { dg-do compile { target { arm*-*-* } } } */
+/* { dg-require-effective-target arm_v8_1m_mve_ok } */
+/* { dg-options "-O3 -save-temps" } */
+/* { dg-add-options arm_v8_1m_mve } */
+
+#include <limits.h>
+#include <arm_mve.h>
+
+/* Terminating on a non-zero number of elements. */
+void test0 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ while (n > 1)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n -= 16;
+ }
+}
+
+/* Terminating on n >= 0. */
+void test1 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ while (n >= 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n -= 16;
+ }
+}
+
+/* Similar, terminating on a non-zero number of elements, but in a for loop
+ format. */
+int32_t a[] = {0, 1, 2, 3, 4, 5, 6, 7};
+void test2 (int32_t *b, int num_elems)
+{
+ for (int i = num_elems; i >= 2; i-= 4)
+ {
+ mve_pred16_t p = vctp32q (i);
+ int32x4_t va = vldrwq_z_s32 (&(a[i]), p);
+ vstrwq_p_s32 (b + i, va, p);
+ }
+}
+
+/* Iteration counter counting up to num_iter, with a non-zero starting num. */
+void test3 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ int num_iter = (n + 15)/16;
+ for (int i = 1; i < num_iter; i++)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n -= 16;
+ }
+}
+
+/* Iteration counter counting up to num_iter, with a larger increment. */
+void test4 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ int num_iter = (n + 15)/16;
+ for (int i = 0; i < num_iter; i+=2)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n -= 16;
+ }
+}
+
+/* Using an unpredicated store instruction within the loop. */
+void test5 (uint8_t *a, uint8_t *b, uint8_t *c, uint8_t *d, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_u8 (va, vb);
+ uint8x16_t vd = vaddq_x_u8 (va, vb, p);
+ vstrbq_u8 (d, vd);
+ n -= 16;
+ }
+}
+
+/* Using an unpredicated store outside the loop. */
+void test6 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p);
+ vx = vaddq_u8 (vx, vc);
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ vstrbq_u8 (c, vx);
+}
+
+/* Using a VPR that gets modified within the loop. */
+void test9 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ p++;
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Using a VPR that gets re-generated within the loop. */
+void test10 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ mve_pred16_t p = vctp32q (n);
+ while (n > 0)
+ {
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ p = vctp32q (n);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Using vctp32q_m instead of vctp32q. */
+void test11 (int32_t *a, int32_t *b, int32_t *c, int n, mve_pred16_t p0)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q_m (n, p0);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Using an unpredicated op with a scalar output, where the result is valid
+ outside the bb. This is invalid, because one of the inputs to the
+ unpredicated op is also unpredicated. */
+uint8_t test12 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx)
+{
+ uint8_t sum = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_u8 (b);
+ uint8x16_t vc = vaddq_u8 (va, vb);
+ sum += vaddvq_u8 (vc);
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ return sum;
+}
+
+/* Using an unpredicated vcmp to generate a new predicate value in the
+ loop and then using that VPR to predicate a store insn. */
+void test13 (int32_t *a, int32_t *b, int32x4_t vc, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_s32 (a);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_s32 (va, vb);
+ mve_pred16_t p1 = vcmpeqq_s32 (va, vc);
+ vstrwq_p_s32 (c, vc, p1);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Using an across-vector unpredicated instruction. "vb" is the risk. */
+uint16_t test14 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
+{
+ uint16x8_t vb = vldrhq_u16 (b);
+ uint16_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ uint16x8_t va = vldrhq_z_u16 (a, p);
+ vb = vaddq_u16 (va, vb);
+ res = vaddvq_u16 (vb);
+ c += 8;
+ a += 8;
+ b += 8;
+ n -= 8;
+ }
+ return res;
+}
+
+/* Using an across-vector unpredicated instruction. "vc" is the risk. */
+uint16_t test15 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
+{
+ uint16x8_t vb = vldrhq_u16 (b);
+ uint16_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ uint16x8_t va = vldrhq_z_u16 (a, p);
+ uint16x8_t vc = vaddq_u16 (va, vb);
+ res = vaddvaq_u16 (res, vc);
+ c += 8;
+ a += 8;
+ b += 8;
+ n -= 8;
+ }
+ return res;
+}
+
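+/* res also accumulates vb, which comes from an unpredicated load, through an
+   unpredicated vaddvaq; presumably this is what stops the transform.  */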
+uint16_t test16 (uint16_t *a, uint16_t *b, uint16_t *c, int n)
+{
+ uint16_t res =0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp16q (n);
+ uint16x8_t vb = vldrhq_u16 (b);
+ uint16x8_t va = vldrhq_z_u16 (a, p);
+ res = vaddvaq_u16 (res, vb);
+ res = vaddvaq_p_u16 (res, va, p);
+ c += 8;
+ a += 8;
+ b += 8;
+ n -= 8;
+ }
+ return res;
+}
+
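+/* Across-vector reductions for which a zero in the inactive lanes is not a
+   neutral value (signed max, signed/unsigned min, minimum absolute value);
+   none of these loops get transformed.  */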
+int test17 (int8_t *a, int8_t *b, int8_t *c, int n)
+{
+ int res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ int8x16_t va = vldrbq_z_s8 (a, p);
+ res = vmaxvq (res, va);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
+
+
+int test18 (int8_t *a, int8_t *b, int8_t *c, int n)
+{
+ int res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ int8x16_t va = vldrbq_z_s8 (a, p);
+ res = vminvq (res, va);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
+int test19 (int8_t *a, int8_t *b, int8_t *c, int n)
+{
+ int res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ int8x16_t va = vldrbq_z_s8 (a, p);
+ res = vminavq (res, va);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
+int test20 (uint8_t *a, uint8_t *b, uint8_t *c, int n)
+{
+ int res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ res = vminvq (res, va);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
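+/* vshlcq shifts bits across the whole vector, with a scalar carry in and out,
+   so its result depends on every lane.  */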
+uint8x16_t test21 (uint8_t *a, uint32_t *b, int n, uint8x16_t res)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ res = vshlcq_u8 (va, b, 1);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
+int8x16_t test22 (int8_t *a, int32_t *b, int n, int8x16_t res)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ int8x16_t va = vldrbq_z_s8 (a, p);
+ res = vshlcq_s8 (va, b, 1);
+ n-=16;
+ a+=16;
+ }
+ return res;
+}
+
+/* Using an unsigned number of elements to count down from, with a >0 condition. */
+void test23 (int32_t *a, int32_t *b, int32_t *c, unsigned int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c+=4;
+ a+=4;
+ b+=4;
+ n-=4;
+ }
+}
+
+/* Using an unsigned number of elements to count up to, with a <n condition. */
+void test24 (uint8_t *a, uint8_t *b, uint8_t *c, unsigned int n)
+{
+ for (int i = 0; i < n; i+=16)
+ {
+ mve_pred16_t p = vctp8q (n-i);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n-=16;
+ }
+}
+
+
+/* Using an unsigned number of elements to count up to, with a <=n condition. */
+void test25 (uint8_t *a, uint8_t *b, uint8_t *c, unsigned int n)
+{
+ for (int i = 1; i <= n; i+=16)
+ {
+ mve_pred16_t p = vctp8q (n-i+1);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_x_u8 (va, vb, p);
+ vstrbq_p_u8 (c, vc, p);
+ n-=16;
+ }
+}
+/* Update n twice in the loop. */
+void test26 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n >= 1)
+ {
+ n-=4;
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_z_s32 (b, p);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ vstrwq_p_s32 (c, vc, p);
+ c+=4;
+ a+=4;
+ b+=4;
+ n-=4;
+ }
+}
+
+void test27 (int32_t *a, int32_t *c, int n)
+{
+ int32_t res = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_s32 (a);
+ res += vaddvq_s32 (va);
+ int32x4_t vc = vdupq_n_s32 (res);
+ vstrwq_p_s32 (c, vc, p);
+ a += 4;
+ n -= 4;
+ }
+}
+
+/* Using an unpredicated vcmp to generate a new predicate value in the
+ loop and then using it in a predicated store insn. */
+void test28 (int32_t *a, int32_t *b, int32_t *c, int n)
+{
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp32q (n);
+ int32x4_t va = vldrwq_z_s32 (a, p);
+ int32x4_t vb = vldrwq_s32 (b);
+ int32x4_t vc = vaddq_x_s32 (va, vb, p);
+ mve_pred16_t p1 = vcmpeqq_s32 (va, vc);
+ vstrwq_p_s32 (c, vc, p1);
+ c += 4;
+ a += 4;
+ b += 4;
+ n -= 4;
+ }
+}
+
+/* Using an unpredicated op with a scalar output, where the result is valid
+   outside the bb.  The unpredicated lanes are not guaranteed zero, so they
+   would affect the vaddv in the non-tail predicated case. */
+uint8_t test29 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx)
+{
+ uint8_t sum = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p);
+ sum += vaddvq_u8 (vc);
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ return sum;
+}
+
+/* Same as above, but with another scalar op between the unpredicated op and
+ the scalar op outside the loop. */
+uint8_t test30 (uint8_t *a, uint8_t *b, uint8_t *c, int n, uint8x16_t vx, int g)
+{
+ uint8_t sum = 0;
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_z_u8 (b, p);
+ uint8x16_t vc = vaddq_m_u8 (vx, va, vb, p);
+ sum += vaddvq_u8 (vc);
+ sum += g;
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ return sum;
+}
+
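+/* In tests 31 and 32 the accumulator is updated by an unpredicated vaddq
+   whose other operand comes from an unpredicated load, and a lane of it is
+   read back after the loop.  */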
+uint8_t test31 (uint8_t *a, uint8_t *b, int n)
+{
+ uint8_t res = 0;
+ uint8x16_t vc = vdupq_n_u8 (0);
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_u8 (b);
+ vc = vaddq (vb, vc);
+ res = vgetq_lane (vc, 5);
+
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ return res;
+}
+
+uint8_t test32 (uint8_t *a, uint8_t *b, int n)
+{
+ uint8_t res = 0;
+ uint8x16_t vc = vdupq_n_u8 (0);
+ while (n > 0)
+ {
+ mve_pred16_t p = vctp8q (n);
+ uint8x16_t va = vldrbq_z_u8 (a, p);
+ uint8x16_t vb = vldrbq_u8 (b);
+ vc = vaddq_m (vc, va, vc, p);
+ vc = vaddq (vb, vc);
+ res = vgetq_lane (vc, 5);
+
+ a += 16;
+ b += 16;
+ n -= 16;
+ }
+ return res;
+}
+
+/* { dg-final { scan-assembler-not "\tdlstp" } } */
+/* { dg-final { scan-assembler-not "\tletp" } } */