;; Machine description for AArch64 SVE.
;; Copyright (C) 2009-2020 Free Software Foundation, Inc.
;; Contributed by ARM Ltd.
;;
;; This file is part of GCC.
;;
;; GCC is free software; you can redistribute it and/or modify it
;; under the terms of the GNU General Public License as published by
;; the Free Software Foundation; either version 3, or (at your option)
;; any later version.
;;
;; GCC is distributed in the hope that it will be useful, but
;; WITHOUT ANY WARRANTY; without even the implied warranty of
;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;; General Public License for more details.
;;
;; You should have received a copy of the GNU General Public License
;; along with GCC; see the file COPYING3. If not see
;; <http://www.gnu.org/licenses/>.
;; The file is organised into the following sections (search for the full
;; line):
;;
;; == General notes
;; ---- Note on the handling of big-endian SVE
;; ---- Description of UNSPEC_PTEST
;; ---- Description of UNSPEC_PRED_Z
;; ---- Note on predicated integer arithmetic and UNSPEC_PRED_X
;; ---- Note on predicated FP arithmetic patterns and GP "strictness"
;; ---- Note on FFR handling
;;
;; == Moves
;; ---- Moves of single vectors
;; ---- Moves of multiple vectors
;; ---- Moves of predicates
;; ---- Moves relating to the FFR
;;
;; == Loads
;; ---- Normal contiguous loads
;; ---- Extending contiguous loads
;; ---- First-faulting contiguous loads
;; ---- First-faulting extending contiguous loads
;; ---- Non-temporal contiguous loads
;; ---- Normal gather loads
;; ---- Extending gather loads
;; ---- First-faulting gather loads
;; ---- First-faulting extending gather loads
;;
;; == Prefetches
;; ---- Contiguous prefetches
;; ---- Gather prefetches
;;
;; == Stores
;; ---- Normal contiguous stores
;; ---- Truncating contiguous stores
;; ---- Non-temporal contiguous stores
;; ---- Normal scatter stores
;; ---- Truncating scatter stores
;;
;; == Vector creation
;; ---- [INT,FP] Duplicate element
;; ---- [INT,FP] Initialize from individual elements
;; ---- [INT] Linear series
;; ---- [PRED] Duplicate element
;;
;; == Vector decomposition
;; ---- [INT,FP] Extract index
;; ---- [INT,FP] Extract active element
;; ---- [PRED] Extract index
;;
;; == Unary arithmetic
;; ---- [INT] General unary arithmetic corresponding to rtx codes
;; ---- [INT] General unary arithmetic corresponding to unspecs
;; ---- [INT] Sign and zero extension
;; ---- [INT] Truncation
;; ---- [INT] Logical inverse
;; ---- [FP<-INT] General unary arithmetic that maps to unspecs
;; ---- [FP] General unary arithmetic corresponding to unspecs
;; ---- [FP] Square root
;; ---- [FP] Reciprocal square root
;; ---- [PRED] Inverse
;;
;; == Binary arithmetic
;; ---- [INT] General binary arithmetic corresponding to rtx codes
;; ---- [INT] Addition
;; ---- [INT] Subtraction
;; ---- [INT] Take address
;; ---- [INT] Absolute difference
;; ---- [INT] Saturating addition and subtraction
;; ---- [INT] Highpart multiplication
;; ---- [INT] Division
;; ---- [INT] Binary logical operations
;; ---- [INT] Binary logical operations (inverted second input)
;; ---- [INT] Shifts (rounding towards -Inf)
;; ---- [INT] Shifts (rounding towards 0)
;; ---- [FP<-INT] General binary arithmetic corresponding to unspecs
;; ---- [FP] General binary arithmetic corresponding to rtx codes
;; ---- [FP] General binary arithmetic corresponding to unspecs
;; ---- [FP] Addition
;; ---- [FP] Complex addition
;; ---- [FP] Subtraction
;; ---- [FP] Absolute difference
;; ---- [FP] Multiplication
;; ---- [FP] Division
;; ---- [FP] Binary logical operations
;; ---- [FP] Sign copying
;; ---- [FP] Maximum and minimum
;; ---- [PRED] Binary logical operations
;; ---- [PRED] Binary logical operations (inverted second input)
;; ---- [PRED] Binary logical operations (inverted result)
;;
;; == Ternary arithmetic
;; ---- [INT] MLA and MAD
;; ---- [INT] MLS and MSB
;; ---- [INT] Dot product
;; ---- [INT] Sum of absolute differences
;; ---- [INT] Matrix multiply-accumulate
;; ---- [FP] General ternary arithmetic corresponding to unspecs
;; ---- [FP] Complex multiply-add
;; ---- [FP] Trigonometric multiply-add
;; ---- [FP] Bfloat16 long ternary arithmetic (SF,BF,BF)
;; ---- [FP] Matrix multiply-accumulate
;;
;; == Comparisons and selects
;; ---- [INT,FP] Select based on predicates
;; ---- [INT,FP] Compare and select
;; ---- [INT] Comparisons
;; ---- [INT] While tests
;; ---- [FP] Direct comparisons
;; ---- [FP] Absolute comparisons
;; ---- [PRED] Select
;; ---- [PRED] Test bits
;;
;; == Reductions
;; ---- [INT,FP] Conditional reductions
;; ---- [INT] Tree reductions
;; ---- [FP] Tree reductions
;; ---- [FP] Left-to-right reductions
;;
;; == Permutes
;; ---- [INT,FP] General permutes
;; ---- [INT,FP] Special-purpose unary permutes
;; ---- [INT,FP] Special-purpose binary permutes
;; ---- [PRED] Special-purpose unary permutes
;; ---- [PRED] Special-purpose binary permutes
;;
;; == Conversions
;; ---- [INT<-INT] Packs
;; ---- [INT<-INT] Unpacks
;; ---- [INT<-FP] Conversions
;; ---- [INT<-FP] Packs
;; ---- [INT<-FP] Unpacks
;; ---- [FP<-INT] Conversions
;; ---- [FP<-INT] Packs
;; ---- [FP<-INT] Unpacks
;; ---- [FP<-FP] Packs
;; ---- [FP<-FP] Packs (bfloat16)
;; ---- [FP<-FP] Unpacks
;; ---- [PRED<-PRED] Packs
;; ---- [PRED<-PRED] Unpacks
;;
;; == Vector partitioning
;; ---- [PRED] Unary partitioning
;; ---- [PRED] Binary partitioning
;; ---- [PRED] Scalarization
;;
;; == Counting elements
;; ---- [INT] Count elements in a pattern (scalar)
;; ---- [INT] Increment by the number of elements in a pattern (scalar)
;; ---- [INT] Increment by the number of elements in a pattern (vector)
;; ---- [INT] Decrement by the number of elements in a pattern (scalar)
;; ---- [INT] Decrement by the number of elements in a pattern (vector)
;; ---- [INT] Count elements in a predicate (scalar)
;; ---- [INT] Increment by the number of elements in a predicate (scalar)
;; ---- [INT] Increment by the number of elements in a predicate (vector)
;; ---- [INT] Decrement by the number of elements in a predicate (scalar)
;; ---- [INT] Decrement by the number of elements in a predicate (vector)
;; =========================================================================
;; == General notes
;; =========================================================================
;;
;; -------------------------------------------------------------------------
;; ---- Note on the handling of big-endian SVE
;; -------------------------------------------------------------------------
;;
;; On big-endian systems, Advanced SIMD mov patterns act in the
;; same way as movdi or movti would: the first byte of memory goes
;; into the most significant byte of the register and the last byte
;; of memory goes into the least significant byte of the register.
;; This is the most natural ordering for Advanced SIMD and matches
;; the ABI layout for 64-bit and 128-bit vector types.
;;
;; As a result, the order of bytes within the register is what GCC
;; expects for a big-endian target, and subreg offsets therefore work
;; as expected, with the first element in memory having subreg offset 0
;; and the last element in memory having the subreg offset associated
;; with a big-endian lowpart. However, this ordering also means that
;; GCC's lane numbering does not match the architecture's numbering:
;; GCC always treats the element at the lowest address in memory
;; (subreg offset 0) as element 0, while the architecture treats
;; the least significant end of the register as element 0.
;;
;; The situation for SVE is different. We want the layout of the
;; SVE register to be same for mov as it is for maskload:
;; logically, a mov load must be indistinguishable from a
;; maskload whose mask is all true. We therefore need the
;; register layout to match LD1 rather than LDR. The ABI layout of
;; SVE types also matches LD1 byte ordering rather than LDR byte ordering.
;;
;; As a result, the architecture lane numbering matches GCC's lane
;; numbering, with element 0 always being the first in memory.
;; However:
;;
;; - Applying a subreg offset to a register does not give the element
;; that GCC expects: the first element in memory has the subreg offset
;; associated with a big-endian lowpart while the last element in memory
;; has subreg offset 0. We handle this via TARGET_CAN_CHANGE_MODE_CLASS.
;;
;; - We cannot use LDR and STR for spill slots that might be accessed
;; via subregs, since although the elements have the order GCC expects,
;; the order of the bytes within the elements is different. We instead
;; access spill slots via LD1 and ST1, using secondary reloads to
;; reserve a predicate register.
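;;
;; As a worked example (illustrative only): take the bytes { 0x01, 0x02 }
;; at increasing addresses and load them as a single 16-bit element on a
;; big-endian target:
;;
;;   LD1H: element 0 == 0x0102  (big-endian interpretation of the pair)
;;   LDR:  element 0 == 0x0201  (raw byte copy, with the lowest-addressed
;;                               byte going to the least significant byte)
;;
;; The element order is the same in both cases, but the byte order within
;; the element is not, which is why the spill slots mentioned above must
;; be accessed via LD1 and ST1 rather than LDR and STR.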
;;
;; -------------------------------------------------------------------------
;; ---- Description of UNSPEC_PTEST
;; -------------------------------------------------------------------------
;;
;; SVE provides a PTEST instruction for testing the active lanes of a
;; predicate and setting the flags based on the result. The associated
;; condition code tests are:
;;
;; - any (= ne): at least one active bit is set
;; - none (= eq): all active bits are clear (*)
;; - first (= mi): the first active bit is set
;; - nfrst (= pl): the first active bit is clear (*)
;; - last (= cc): the last active bit is set
;; - nlast (= cs): the last active bit is clear (*)
;;
;; where the conditions marked (*) are also true when there are no active
;; lanes (i.e. when the governing predicate is a PFALSE). The flags results
;; of a PTEST use the condition code mode CC_NZC.
;;
;; PTEST is always a .B operation (i.e. it always operates on VNx16BI).
;; This means that for other predicate modes, we need a governing predicate
;; in which all bits are defined.
;;
;; For example, most predicated .H operations ignore the odd bits of the
;; governing predicate, so that an active lane is represented by the
;; bits "1x" and an inactive lane by the bits "0x", where "x" can be
;; any value. To test a .H predicate, we instead need "10" and "00"
;; respectively, so that the condition only tests the even bits of the
;; predicate.
;;
;; Several instructions set the flags as a side-effect, in the same way
;; that a separate PTEST would. It's important for code quality that we
;; use these flags results as often as possible, particularly in the case
;; of WHILE* and RDFFR.
;;
;; Also, some of the instructions that set the flags are unpredicated
;; and instead implicitly test all .B, .H, .S or .D elements, as though
;; they were predicated on a PTRUE of that size. For example, a .S
;; WHILELO sets the flags in the same way as a PTEST with a .S PTRUE
;; would.
;;
;; We therefore need to represent PTEST operations in a way that
;; makes it easy to combine them with both predicated and unpredicated
;; operations, while using a VNx16BI governing predicate for all
;; predicate modes. We do this using:
;;
;; (unspec:CC_NZC [gp cast_gp ptrue_flag op] UNSPEC_PTEST)
;;
;; where:
;;
;; - GP is the real VNx16BI governing predicate
;;
;; - CAST_GP is GP cast to the mode of OP. All bits dropped by casting
;; GP to CAST_GP are guaranteed to be clear in GP.
;;
;; - PTRUE_FLAG is a CONST_INT (conceptually of mode SI) that has the value
;; SVE_KNOWN_PTRUE if we know that CAST_GP (rather than GP) is all-true and
;; SVE_MAYBE_NOT_PTRUE otherwise.
;;
;; - OP is the predicate we want to test, of the same mode as CAST_GP.
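;;
;; For example (an informal sketch in the notation above, not a pattern
;; taken verbatim from later in the file): to test a .H predicate OP whose
;; governing predicate is a .H PTRUE, we would use:
;;
;;   (unspec:CC_NZC [gp        ; the .H PTRUE as a VNx16BI value, in which
;;                             ; the unused odd-indexed bits are clear
;;                   cast_gp   ; the same value as a VNx8BI, which is
;;                             ; all-true in that mode
;;                   (const_int SVE_KNOWN_PTRUE)
;;                   op]       ; the VNx8BI predicate being tested
;;                  UNSPEC_PTEST)
;;
;; A following "ne" test of the CC_NZC result would then implement an
;; "any active lane set" check.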
;;
;; -------------------------------------------------------------------------
;; ---- Description of UNSPEC_PRED_Z
;; -------------------------------------------------------------------------
;;
;; SVE integer comparisons are predicated and return zero for inactive
;; lanes. Sometimes we use them with predicates that are all-true and
;; sometimes we use them with general predicates.
;;
;; The integer comparisons also set the flags and so build in the effect
;; of a PTEST. We therefore want to be able to combine integer comparison
;; patterns with PTESTs of the result. One difficulty with doing this is
;; that (as noted above) the PTEST is always a .B operation and so can place
;; stronger requirements on the governing predicate than the comparison does.
;;
;; For example, when applying a separate PTEST to the result of a full-vector
;; .H comparison, the PTEST must be predicated on a .H PTRUE instead of a
;; .B PTRUE. In contrast, the comparison might be predicated on either
;; a .H PTRUE or a .B PTRUE, since the values of odd-indexed predicate
;; bits don't matter for .H operations.
;;
;; We therefore can't rely on a full-vector comparison using the same
;; predicate register as a following PTEST. We instead need to remember
;; whether a comparison is known to be a full-vector comparison and use
;; this information in addition to a check for equal predicate registers.
;; At the same time, it's useful to have a common representation for all
;; integer comparisons, so that they can be handled by a single set of
;; patterns.
;;
;; We therefore take a similar approach to UNSPEC_PTEST above and use:
;;
;; (unspec:<M:VPRED> [gp ptrue_flag (code:M op0 op1)] UNSPEC_PRED_Z)
;;
;; where:
;;
;; - GP is the governing predicate, of mode <M:VPRED>
;;
;; - PTRUE_FLAG is a CONST_INT (conceptually of mode SI) that has the value
;; SVE_KNOWN_PTRUE if we know that GP is all-true and SVE_MAYBE_NOT_PTRUE
;; otherwise
;;
;; - CODE is the comparison code
;;
;; - OP0 and OP1 are the values being compared, of mode M
;;
;; The "Z" in UNSPEC_PRED_Z indicates that inactive lanes are zero.
;;
;; -------------------------------------------------------------------------
;; ---- Note on predicated integer arithmetic and UNSPEC_PRED_X
;; -------------------------------------------------------------------------
;;
;; Many SVE integer operations are predicated. We can generate them
;; from four sources:
;;
;; (1) Using normal unpredicated optabs. In this case we need to create
;; an all-true predicate register to act as the governing predicate
;; for the SVE instruction. There are no inactive lanes, and thus
;; the values of inactive lanes don't matter.
;;
;; (2) Using _x ACLE functions. In this case the function provides a
;; specific predicate and some lanes might be inactive. However,
;; as for (1), the values of the inactive lanes don't matter.
;; We can make extra lanes active without changing the behavior
;; (although for code-quality reasons we should avoid doing so
;; needlessly).
;;
;; (3) Using cond_* optabs that correspond to IFN_COND_* internal functions.
;; These optabs have a predicate operand that specifies which lanes are
;; active and another operand that provides the values of inactive lanes.
;;
;; (4) Using _m and _z ACLE functions. These functions map to the same
;; patterns as (3), with the _z functions setting inactive lanes to zero
;; and the _m functions setting the inactive lanes to one of the function
;; arguments.
;;
;; For (1) and (2) we need a way of attaching the predicate to a normal
;; unpredicated integer operation. We do this using:
;;
;; (unspec:M [pred (code:M (op0 op1 ...))] UNSPEC_PRED_X)
;;
;; where (code:M (op0 op1 ...)) is the normal integer operation and PRED
;; is a predicate of mode <M:VPRED>. PRED might or might not be a PTRUE;
;; it always is for (1), but might not be for (2).
;;
;; The unspec as a whole has the same value as (code:M ...) when PRED is
;; all-true. It is always semantically valid to replace PRED with a PTRUE,
;; but as noted above, we should only do so if there's a specific benefit.
;;
;; (The "_X" in the unspec is named after the ACLE functions in (2).)
;;
;; For (3) and (4) we can simply use the SVE port's normal representation
;; of a predicate-based select:
;;
;; (unspec:M [pred (code:M (op0 op1 ...)) inactive] UNSPEC_SEL)
;;
;; where INACTIVE specifies the values of inactive lanes.
;;
;; We can also use the UNSPEC_PRED_X wrapper in the UNSPEC_SEL rather
;; than inserting the integer operation directly. This is mostly useful
;; if we want the combine pass to merge an integer operation with an explicit
;; vcond_mask (in other words, with a following SEL instruction). However,
;; it's generally better to merge such operations at the gimple level
;; using (3).
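;;
;; As a concrete illustration (informal, in the same notation as above),
;; an addition of two VNx8HI values X and Y generated from source (1) or
;; (2) would be:
;;
;;   (unspec:VNx8HI [pred (plus:VNx8HI x y)] UNSPEC_PRED_X)
;;
;; whereas the equivalent of (3) or (4), with inactive lanes set to zero,
;; would be:
;;
;;   (unspec:VNx8HI [pred (plus:VNx8HI x y) zero] UNSPEC_SEL)
;;
;; where ZERO stands for a vector of zero constants.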
;;
;; -------------------------------------------------------------------------
;; ---- Note on predicated FP arithmetic patterns and GP "strictness"
;; -------------------------------------------------------------------------
;;
;; Most SVE floating-point operations are predicated. We can generate
;; them from four sources:
;;
;; (1) Using normal unpredicated optabs. In this case we need to create
;; an all-true predicate register to act as the governing predicate
;; for the SVE instruction. There are no inactive lanes, and thus
;; the values of inactive lanes don't matter.
;;
;; (2) Using _x ACLE functions. In this case the function provides a
;; specific predicate and some lanes might be inactive. However,
;; as for (1), the values of the inactive lanes don't matter.
;;
;; The instruction must have the same exception behavior as the
;; function call unless things like command-line flags specifically
;; allow otherwise. For example, with -ffast-math, it is OK to
;; raise exceptions for inactive lanes, but normally it isn't.
;;
;; (3) Using cond_* optabs that correspond to IFN_COND_* internal functions.
;; These optabs have a predicate operand that specifies which lanes are
;; active and another operand that provides the values of inactive lanes.
;;
;; (4) Using _m and _z ACLE functions. These functions map to the same
;; patterns as (3), with the _z functions setting inactive lanes to zero
;; and the _m functions setting the inactive lanes to one of the function
;; arguments.
;;
;; So:
;;
;; - In (1), the predicate is known to be all true and the pattern can use
;; unpredicated operations where available.
;;
;; - In (2), the predicate might or might not be all true. The pattern can
;; use unpredicated instructions if the predicate is all-true or if things
;; like command-line flags allow exceptions for inactive lanes.
;;
;; - (3) and (4) represent a native SVE predicated operation. Some lanes
;; might be inactive and inactive lanes of the result must have specific
;; values. There is no scope for using unpredicated instructions (and no
;; reason to want to), so the question about command-line flags doesn't
;; arise.
;;
;; It would be inaccurate to model (2) as an rtx code like (sqrt ...)
;; in combination with a separate predicate operand, e.g.
;;
;; (unspec [(match_operand:<VPRED> 1 "register_operand" "Upl")
;; (sqrt:SVE_FULL_F 2 "register_operand" "w")]
;; ....)
;;
;; because (sqrt ...) can raise an exception for any lane, including
;; inactive ones. We therefore need to use an unspec instead.
;;
;; Also, (2) requires some way of distinguishing the case in which the
;; predicate might have inactive lanes and cannot be changed from the
;; case in which the predicate has no inactive lanes or can be changed.
;; This information is also useful when matching combined FP patterns
;; in which the predicates might not be equal.
;;
;; We therefore model FP operations as an unspec of the form:
;;
;; (unspec [pred strictness op0 op1 ...] UNSPEC_COND_<MNEMONIC>)
;;
;; where:
;;
;; - PRED is the governing predicate.
;;
;; - STRICTNESS is a CONST_INT that conceptually has mode SI. It has the
;; value SVE_STRICT_GP if PRED might have inactive lanes and if those
;; lanes must remain inactive. It has the value SVE_RELAXED_GP otherwise.
;;
;; - OP0 OP1 ... are the normal input operands to the operation.
;;
;; - MNEMONIC is the mnemonic of the associated SVE instruction.
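;;
;; For example (an informal sketch), an svadd_x call on VNx4SF values X
;; and Y, compiled with the default exception behavior, would be:
;;
;;   (unspec:VNx4SF [pred (const_int SVE_STRICT_GP) x y] UNSPEC_COND_FADD)
;;
;; since the predicate might have inactive lanes and those lanes must not
;; raise exceptions.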
;;
;; For (3) and (4), we combine these operations with an UNSPEC_SEL
;; that selects between the result of the FP operation and the "else"
;; value. (This else value is a merge input for _m ACLE functions
;; and zero for _z ACLE functions.) The outer pattern then has the form:
;;
;; (unspec [pred fp_operation else_value] UNSPEC_SEL)
;;
;; This means that the patterns for (3) and (4) have two predicates:
;; one for the FP operation itself and one for the UNSPEC_SEL.
;; This pattern is equivalent to the result of combining an instance
;; of (1) or (2) with a separate vcond instruction, so these patterns
;; are useful as combine targets too.
;;
;; However, in the combine case, the instructions that we want to
;; combine might use different predicates. Then:
;;
;; - Some of the active lanes of the FP operation might be discarded
;; by the UNSPEC_SEL. It's OK to drop the FP operation on those lanes,
;; even for SVE_STRICT_GP, since the operations on those lanes are
;; effectively dead code.
;;
;; - Some of the inactive lanes of the FP operation might be selected
;; by the UNSPEC_SEL, giving unspecified values for those lanes.
;; SVE_RELAXED_GP lets us extend the FP operation to cover these
;; extra lanes, but SVE_STRICT_GP does not.
;;
;; Thus SVE_RELAXED_GP allows us to ignore the predicate on the FP operation
;; and operate on exactly the lanes selected by the UNSPEC_SEL predicate.
;; This typically leads to patterns like:
;;
;; (unspec [(match_operand 1 "register_operand" "Upl")
;; (unspec [(match_operand N)
;; (const_int SVE_RELAXED_GP)
;; ...]
;; UNSPEC_COND_<MNEMONIC>)
;; ...])
;;
;; where operand N is allowed to be anything. These instructions then
;; have rewrite rules to replace operand N with operand 1, which gives the
;; instructions a canonical form and means that the original operand N is
;; not kept live unnecessarily.
;;
;; In contrast, SVE_STRICT_GP only allows the UNSPEC_SEL predicate to be
;; a subset of the FP operation predicate. This case isn't interesting
;; for FP operations that have an all-true predicate, since such operations
;; use SVE_RELAXED_GP instead. And it is not possible for instruction
;; conditions to track the subset relationship for arbitrary registers.
;; So in practice, the only useful case for SVE_STRICT_GP is the one
;; in which the predicates match:
;;
;; (unspec [(match_operand 1 "register_operand" "Upl")
;; (unspec [(match_dup 1)
;; (const_int SVE_STRICT_GP)
;; ...]
;; UNSPEC_COND_<MNEMONIC>)
;; ...])
;;
;; This pattern would also be correct for SVE_RELAXED_GP, but it would
;; be redundant with the one above. However, if the combine pattern
;; has multiple FP operations, using a match_operand allows combinations
;; of SVE_STRICT_GP and SVE_RELAXED_GP in the same operation, provided
;; that the predicates are the same:
;;
;; (unspec [(match_operand 1 "register_operand" "Upl")
;; (...
;; (unspec [(match_dup 1)
;; (match_operand:SI N "aarch64_sve_gp_strictness")
;; ...]
;; UNSPEC_COND_<MNEMONIC>)
;; (unspec [(match_dup 1)
;; (match_operand:SI M "aarch64_sve_gp_strictness")
;; ...]
;; UNSPEC_COND_<MNEMONIC>) ...)
;; ...])
;;
;; The fully-relaxed version of this pattern is:
;;
;; (unspec [(match_operand 1 "register_operand" "Upl")
;; (...
;; (unspec [(match_operand N)
;; (const_int SVE_RELAXED_GP)
;; ...]
;; UNSPEC_COND_<MNEMONIC>)
;; (unspec [(match_operand M)
;; (const_int SVE_RELAXED_GP)
;; ...]
;; UNSPEC_COND_<MNEMONIC>) ...)
;; ...])
;;
;; -------------------------------------------------------------------------
;; ---- Note on FFR handling
;; -------------------------------------------------------------------------
;;
;; Logically we want to divide FFR-related instructions into regions
;; that contain exactly one of:
;;
;; - a single write to the FFR
;; - any number of reads from the FFR (but only one read is likely)
;; - any number of LDFF1 and LDNF1 instructions
;;
;; However, LDFF1 and LDNF1 instructions should otherwise behave like
;; normal loads as far as possible. This means that they should be
;; schedulable within a region in the same way that LD1 would be,
;; and they should be deleted as dead if the result is unused. The loads
;; should therefore not write to the FFR, since that would both serialize
;; the loads with respect to each other and keep the loads live for any
;; later RDFFR.
;;
;; We get around this by using a fake "FFR token" (FFRT) to help describe
;; the dependencies. Writing to the FFRT starts a new "FFRT region",
;; while using the FFRT keeps the instruction within its region.
;; Specifically:
;;
;; - Writes start a new FFRT region as well as setting the FFR:
;;
;; W1: parallel (FFRT = <new value>, FFR = <new value>)
;;
;; - Loads use an LD1-like instruction that also uses the FFRT, so that the
;; loads stay within the same FFRT region:
;;
;; L1: load data while using the FFRT
;;
;; In addition, any FFRT region that includes a load also has at least one
;; instance of:
;;
;; L2: FFR = update(FFR, FFRT) [type == no_insn]
;;
;; to make it clear that the region both reads from and writes to the FFR.
;;
;; - Reads do the following:
;;
;; R1: FFRT = FFR [type == no_insn]
;; R2: read from the FFRT
;; R3: FFRT = update(FFRT) [type == no_insn]
;;
;; R1 and R3 both create new FFRT regions, so that previous LDFF1s and
;; LDNF1s cannot move forwards across R1 and later LDFF1s and LDNF1s
;; cannot move backwards across R3.
;;
;; This way, writes are only kept alive by later loads or reads,
;; and write/read pairs fold normally. For two consecutive reads,
;; the first R3 is made dead by the second R1, which in turn becomes
;; redundant with the first R1. We then have:
;;
;; first R1: FFRT = FFR
;; first read from the FFRT
;; second read from the FFRT
;; second R3: FFRT = update(FFRT)
;;
;; i.e. the two FFRT regions collapse into a single one with two
;; independent reads.
;;
;; The model still prevents some valid optimizations though. For example,
;; if all loads in an FFRT region are deleted as dead, nothing would remove
;; the L2 instructions.
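;;
;; As an illustration (informal, in terms of the labels above), a simple
;; svsetffr/svldff1/svrdffr sequence would expand to something like:
;;
;;   W1: SETFFR                     (write the FFR, start a new FFRT region)
;;   L1: LDFF1...                   (load while using the FFRT)
;;   L2: FFR = update(FFR, FFRT)
;;   R1: FFRT = FFR
;;   R2: RDFFR                      (read via the FFRT)
;;   R3: FFRT = update(FFRT)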
;; =========================================================================
;; == Moves
;; =========================================================================
;; -------------------------------------------------------------------------
;; ---- Moves of single vectors
;; -------------------------------------------------------------------------
;; Includes:
;; - MOV (including aliases)
;; - LD1B (contiguous form)
;; - LD1D ( " " )
;; - LD1H ( " " )
;; - LD1W ( " " )
;; - LDR
;; - ST1B (contiguous form)
;; - ST1D ( " " )
;; - ST1H ( " " )
;; - ST1W ( " " )
;; - STR
;; -------------------------------------------------------------------------
(define_expand "mov"
[(set (match_operand:SVE_ALL 0 "nonimmediate_operand")
(match_operand:SVE_ALL 1 "general_operand"))]
"TARGET_SVE"
{
/* Use the predicated load and store patterns where possible.
This is required for big-endian targets (see the comment at the
head of the file) and increases the addressing choices for
little-endian. */
if ((MEM_P (operands[0]) || MEM_P (operands[1]))
&& can_create_pseudo_p ())
{
aarch64_expand_sve_mem_move (operands[0], operands[1], mode);
DONE;
}
if (CONSTANT_P (operands[1]))
{
aarch64_expand_mov_immediate (operands[0], operands[1]);
DONE;
}
/* Optimize subregs on big-endian targets: we can use REV[BHW]
instead of going through memory. */
if (BYTES_BIG_ENDIAN
&& aarch64_maybe_expand_sve_subreg_move (operands[0], operands[1]))
DONE;
}
)
(define_expand "movmisalign"
[(set (match_operand:SVE_ALL 0 "nonimmediate_operand")
(match_operand:SVE_ALL 1 "general_operand"))]
"TARGET_SVE"
{
/* Equivalent to a normal move for our purposes. */
emit_move_insn (operands[0], operands[1]);
DONE;
}
)
;; Unpredicated moves that can use LDR and STR, i.e. full vectors for which
;; little-endian ordering is acceptable. Only allow memory operations during
;; and after RA; before RA we want the predicated load and store patterns to
;; be used instead.
(define_insn "*aarch64_sve_mov_ldr_str"
[(set (match_operand:SVE_FULL 0 "aarch64_sve_nonimmediate_operand" "=w, Utr, w, w")
(match_operand:SVE_FULL 1 "aarch64_sve_general_operand" "Utr, w, w, Dn"))]
"TARGET_SVE
&& (mode == VNx16QImode || !BYTES_BIG_ENDIAN)
&& ((lra_in_progress || reload_completed)
|| (register_operand (operands[0], mode)
&& nonmemory_operand (operands[1], mode)))"
"@
ldr\t%0, %1
str\t%1, %0
mov\t%0.d, %1.d
* return aarch64_output_sve_mov_immediate (operands[1]);"
)
;; Unpredicated moves that cannot use LDR and STR, i.e. partial vectors
;; or vectors for which little-endian ordering isn't acceptable. Memory
;; accesses require secondary reloads.
(define_insn "*aarch64_sve_mov_no_ldr_str"
[(set (match_operand:SVE_ALL 0 "register_operand" "=w, w")
(match_operand:SVE_ALL 1 "aarch64_nonmemory_operand" "w, Dn"))]
"TARGET_SVE
&& mode != VNx16QImode
&& (BYTES_BIG_ENDIAN
|| maybe_ne (BYTES_PER_SVE_VECTOR, GET_MODE_SIZE (mode)))"
"@
mov\t%0.d, %1.d
* return aarch64_output_sve_mov_immediate (operands[1]);"
)
;; Handle memory reloads for modes that can't use LDR and STR. We use
;; byte PTRUE for all modes to try to encourage reuse. This pattern
;; needs constraints because it is returned by TARGET_SECONDARY_RELOAD.
(define_expand "aarch64_sve_reload_mem"
[(parallel
[(set (match_operand 0)
(match_operand 1))
(clobber (match_operand:VNx16BI 2 "register_operand" "=Upl"))])]
"TARGET_SVE"
{
/* Create a PTRUE. */
emit_move_insn (operands[2], CONSTM1_RTX (VNx16BImode));
/* Refer to the PTRUE in the appropriate mode for this move. */
machine_mode mode = GET_MODE (operands[0]);
rtx pred = gen_lowpart (aarch64_sve_pred_mode (mode), operands[2]);
/* Emit a predicated load or store. */
aarch64_emit_sve_pred_move (operands[0], pred, operands[1]);
DONE;
}
)
;; A predicated move in which the predicate is known to be all-true.
;; Note that this pattern is generated directly by aarch64_emit_sve_pred_move,
;; so changes to this pattern will need changes there as well.
(define_insn_and_split "@aarch64_pred_mov"
[(set (match_operand:SVE_ALL 0 "nonimmediate_operand" "=w, w, m")
(unspec:SVE_ALL
[(match_operand: 1 "register_operand" "Upl, Upl, Upl")
(match_operand:SVE_ALL 2 "nonimmediate_operand" "w, m, w")]
UNSPEC_PRED_X))]
"TARGET_SVE
&& (register_operand (operands[0], mode)
|| register_operand (operands[2], mode))"
"@
#
ld1\t%0., %1/z, %2
st1\t%2., %1, %0"
"&& register_operand (operands[0], mode)
&& register_operand (operands[2], mode)"
[(set (match_dup 0) (match_dup 2))]
)
;; A pattern for optimizing SUBREGs that have a reinterpreting effect
;; on big-endian targets; see aarch64_maybe_expand_sve_subreg_move
;; for details. We use a special predicate for operand 2 to reduce
;; the number of patterns.
(define_insn_and_split "*aarch64_sve_mov_subreg_be"
[(set (match_operand:SVE_ALL 0 "aarch64_sve_nonimmediate_operand" "=w")
(unspec:SVE_ALL
[(match_operand:VNx16BI 1 "register_operand" "Upl")
(match_operand 2 "aarch64_any_register_operand" "w")]
UNSPEC_REV_SUBREG))]
"TARGET_SVE && BYTES_BIG_ENDIAN"
"#"
"&& reload_completed"
[(const_int 0)]
{
aarch64_split_sve_subreg_move (operands[0], operands[1], operands[2]);
DONE;
}
)
;; Reinterpret operand 1 in operand 0's mode, without changing its contents.
;; This is equivalent to a subreg on little-endian targets but not for
;; big-endian; see the comment at the head of the file for details.
(define_expand "@aarch64_sve_reinterpret"
[(set (match_operand:SVE_ALL 0 "register_operand")
(unspec:SVE_ALL
[(match_operand 1 "aarch64_any_register_operand")]
UNSPEC_REINTERPRET))]
"TARGET_SVE"
{
machine_mode src_mode = GET_MODE (operands[1]);
if (targetm.can_change_mode_class (mode, src_mode, FP_REGS))
{
emit_move_insn (operands[0], gen_lowpart (mode, operands[1]));
DONE;
}
}
)
;; A pattern for handling type punning on big-endian targets. We use a
;; special predicate for operand 1 to reduce the number of patterns.
(define_insn_and_split "*aarch64_sve_reinterpret"
[(set (match_operand:SVE_ALL 0 "register_operand" "=w")
(unspec:SVE_ALL
[(match_operand 1 "aarch64_any_register_operand" "w")]
UNSPEC_REINTERPRET))]
"TARGET_SVE"
"#"
"&& reload_completed"
[(set (match_dup 0) (match_dup 1))]
{
operands[1] = aarch64_replace_reg_mode (operands[1], mode);
}
)
;; -------------------------------------------------------------------------
;; ---- Moves of multiple vectors
;; -------------------------------------------------------------------------
;; All patterns in this section are synthetic and split to real
;; instructions after reload.
;; -------------------------------------------------------------------------
(define_expand "mov"
[(set (match_operand:SVE_STRUCT 0 "nonimmediate_operand")
(match_operand:SVE_STRUCT 1 "general_operand"))]
"TARGET_SVE"
{
/* Big-endian loads and stores need to be done via LD1 and ST1;
see the comment at the head of the file for details. */
if ((MEM_P (operands[0]) || MEM_P (operands[1]))
&& BYTES_BIG_ENDIAN)
{
gcc_assert (can_create_pseudo_p ());
aarch64_expand_sve_mem_move (operands[0], operands[1], mode);
DONE;
}
if (CONSTANT_P (operands[1]))
{
aarch64_expand_mov_immediate (operands[0], operands[1]);
DONE;
}
}
)
;; Unpredicated structure moves (little-endian).
(define_insn "*aarch64_sve_mov_le"
[(set (match_operand:SVE_STRUCT 0 "aarch64_sve_nonimmediate_operand" "=w, Utr, w, w")
(match_operand:SVE_STRUCT 1 "aarch64_sve_general_operand" "Utr, w, w, Dn"))]
"TARGET_SVE && !BYTES_BIG_ENDIAN"
"#"
[(set_attr "length" "")]
)
;; Unpredicated structure moves (big-endian). Memory accesses require
;; secondary reloads.
(define_insn "*aarch64_sve_mov_be"
[(set (match_operand:SVE_STRUCT 0 "register_operand" "=w, w")
(match_operand:SVE_STRUCT 1 "aarch64_nonmemory_operand" "w, Dn"))]
"TARGET_SVE && BYTES_BIG_ENDIAN"
"#"
[(set_attr "length" "")]
)
;; Split unpredicated structure moves into pieces. This is the same
;; for both big-endian and little-endian code, although it only needs
;; to handle memory operands for little-endian code.
(define_split
[(set (match_operand:SVE_STRUCT 0 "aarch64_sve_nonimmediate_operand")
(match_operand:SVE_STRUCT 1 "aarch64_sve_general_operand"))]
"TARGET_SVE && reload_completed"
[(const_int 0)]
{
rtx dest = operands[0];
rtx src = operands[1];
if (REG_P (dest) && REG_P (src))
aarch64_simd_emit_reg_reg_move (operands, mode, );
else
for (unsigned int i = 0; i < ; ++i)
{
rtx subdest = simplify_gen_subreg (mode, dest, mode,
i * BYTES_PER_SVE_VECTOR);
rtx subsrc = simplify_gen_subreg (mode, src, mode,
i * BYTES_PER_SVE_VECTOR);
emit_insn (gen_rtx_SET (subdest, subsrc));
}
DONE;
}
)
;; Predicated structure moves. This works for both endiannesses but in
;; practice is only useful for big-endian.
(define_insn_and_split "@aarch64_pred_mov"
[(set (match_operand:SVE_STRUCT 0 "aarch64_sve_struct_nonimmediate_operand" "=w, w, Utx")
(unspec:SVE_STRUCT
[(match_operand: 1 "register_operand" "Upl, Upl, Upl")
(match_operand:SVE_STRUCT 2 "aarch64_sve_struct_nonimmediate_operand" "w, Utx, w")]
UNSPEC_PRED_X))]
"TARGET_SVE
&& (register_operand (operands[0], mode)
|| register_operand (operands[2], mode))"
"#"
"&& reload_completed"
[(const_int 0)]
{
for (unsigned int i = 0; i < ; ++i)
{
rtx subdest = simplify_gen_subreg (mode, operands[0],
mode,
i * BYTES_PER_SVE_VECTOR);
rtx subsrc = simplify_gen_subreg (mode, operands[2],
mode,
i * BYTES_PER_SVE_VECTOR);
aarch64_emit_sve_pred_move (subdest, operands[1], subsrc);
}
DONE;
}
[(set_attr "length" "")]
)
;; -------------------------------------------------------------------------
;; ---- Moves of predicates
;; -------------------------------------------------------------------------
;; Includes:
;; - MOV
;; - LDR
;; - PFALSE
;; - PTRUE
;; - PTRUES
;; - STR
;; -------------------------------------------------------------------------
(define_expand "mov"
[(set (match_operand:PRED_ALL 0 "nonimmediate_operand")
(match_operand:PRED_ALL 1 "general_operand"))]
"TARGET_SVE"
{
if (GET_CODE (operands[0]) == MEM)
operands[1] = force_reg (mode, operands[1]);
if (CONSTANT_P (operands[1]))
{
aarch64_expand_mov_immediate (operands[0], operands[1]);
DONE;
}
}
)
(define_insn "*aarch64_sve_mov"
[(set (match_operand:PRED_ALL 0 "nonimmediate_operand" "=Upa, m, Upa, Upa")
(match_operand:PRED_ALL 1 "aarch64_mov_operand" "Upa, Upa, m, Dn"))]
"TARGET_SVE
&& (register_operand (operands[0], mode)
|| register_operand (operands[1], mode))"
"@
mov\t%0.b, %1.b
str\t%1, %0
ldr\t%0, %1
* return aarch64_output_sve_mov_immediate (operands[1]);"
)
;; Match PTRUES Pn.B when both the predicate and flags are useful.
(define_insn_and_rewrite "*aarch64_sve_ptruevnx16bi_cc"
[(set (reg:CC_NZC CC_REGNUM)
(unspec:CC_NZC
[(match_operand 2)
(match_operand 3)
(const_int SVE_KNOWN_PTRUE)
(match_operator:VNx16BI 1 "aarch64_sve_ptrue_svpattern_immediate"
[(unspec:VNx16BI
[(match_operand:SI 4 "const_int_operand")
(match_operand:VNx16BI 5 "aarch64_simd_imm_zero")]
UNSPEC_PTRUE)])]
UNSPEC_PTEST))
(set (match_operand:VNx16BI 0 "register_operand" "=Upa")
(match_dup 1))]
"TARGET_SVE"
{
return aarch64_output_sve_ptrues (operands[1]);
}
"&& (!CONSTANT_P (operands[2]) || !CONSTANT_P (operands[3]))"
{
operands[2] = operands[3] = CONSTM1_RTX (VNx16BImode);
}
)
;; Match PTRUES Pn.[HSD] when both the predicate and flags are useful.
(define_insn_and_rewrite "*aarch64_sve_ptrue_cc"
[(set (reg:CC_NZC CC_REGNUM)
(unspec:CC_NZC
[(match_operand 2)
(match_operand 3)
(const_int SVE_KNOWN_PTRUE)
(subreg:PRED_HSD
(match_operator:VNx16BI 1 "aarch64_sve_ptrue_svpattern_immediate"
[(unspec:VNx16BI
[(match_operand:SI 4 "const_int_operand")
(match_operand:PRED_HSD 5 "aarch64_simd_imm_zero")]
UNSPEC_PTRUE)]) 0)]
UNSPEC_PTEST))
(set (match_operand:VNx16BI 0 "register_operand" "=Upa")
(match_dup 1))]
"TARGET_SVE"
{
return aarch64_output_sve_ptrues (operands[1]);
}
"&& (!CONSTANT_P (operands[2]) || !CONSTANT_P (operands[3]))"
{
operands[2] = CONSTM1_RTX (VNx16BImode);
operands[3] = CONSTM1_RTX (mode);
}
)
;; Match PTRUES Pn.B when only the flags result is useful (which is
;; a way of testing VL).
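;; For example, an ACLE sequence along the lines of
;; svptest_any (svptrue_b8 (), svptrue_pat_b8 (SV_VL16)), which checks
;; whether there are at least 16 byte elements, only needs the flags
;; result and can be matched by this pattern once the PTRUE and PTEST
;; have been combined.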
(define_insn_and_rewrite "*aarch64_sve_ptruevnx16bi_ptest"
[(set (reg:CC_NZC CC_REGNUM)
(unspec:CC_NZC
[(match_operand 2)
(match_operand 3)
(const_int SVE_KNOWN_PTRUE)
(match_operator:VNx16BI 1 "aarch64_sve_ptrue_svpattern_immediate"
[(unspec:VNx16BI
[(match_operand:SI 4 "const_int_operand")
(match_operand:VNx16BI 5 "aarch64_simd_imm_zero")]
UNSPEC_PTRUE)])]
UNSPEC_PTEST))
(clobber (match_scratch:VNx16BI 0 "=Upa"))]
"TARGET_SVE"
{
return aarch64_output_sve_ptrues (operands[1]);
}
"&& (!CONSTANT_P (operands[2]) || !CONSTANT_P (operands[3]))"
{
operands[2] = operands[3] = CONSTM1_RTX (VNx16BImode);
}
)
;; Match PTRUES Pn.[HSD] when only the flags result is useful (which is
;; a way of testing VL).
(define_insn_and_rewrite "*aarch64_sve_ptrue_ptest"
[(set (reg:CC_NZC CC_REGNUM)
(unspec:CC_NZC
[(match_operand 2)
(match_operand 3)
(const_int SVE_KNOWN_PTRUE)
(subreg:PRED_HSD
(match_operator:VNx16BI 1 "aarch64_sve_ptrue_svpattern_immediate"
[(unspec:VNx16BI
[(match_operand:SI 4 "const_int_operand")
(match_operand:PRED_HSD 5 "aarch64_simd_imm_zero")]
UNSPEC_PTRUE)]) 0)]
UNSPEC_PTEST))
(clobber (match_scratch:VNx16BI 0 "=Upa"))]
"TARGET_SVE"
{
return aarch64_output_sve_ptrues (operands[1]);
}
"&& (!CONSTANT_P (operands[2]) || !CONSTANT_P (operands[3]))"
{
operands[2] = CONSTM1_RTX (VNx16BImode);
operands[3] = CONSTM1_RTX (mode);
}
)
;; -------------------------------------------------------------------------
;; ---- Moves relating to the FFR
;; -------------------------------------------------------------------------
;; Includes:
;; - RDFFR
;; - RDFFRS
;; - SETFFR
;; - WRFFR
;; -------------------------------------------------------------------------
;; [W1 in the block comment above about FFR handling]
;;
;; Write to the FFR and start a new FFRT scheduling region.
(define_insn "aarch64_wrffr"
[(set (reg:VNx16BI FFR_REGNUM)
(match_operand:VNx16BI 0 "aarch64_simd_reg_or_minus_one" "Dm, Upa"))
(set (reg:VNx16BI FFRT_REGNUM)
(unspec:VNx16BI [(match_dup 0)] UNSPEC_WRFFR))]
"TARGET_SVE"
"@
setffr
wrffr\t%0.b"
)
;; [L2 in the block comment above about FFR handling]
;;
;; Introduce a read from and write to the FFR in the current FFRT region,
;; so that the FFR value is live on entry to the region and so that the FFR
;; value visibly changes within the region. This is used (possibly multiple
;; times) in an FFRT region that includes LDFF1 or LDNF1 instructions.
(define_insn "aarch64_update_ffr_for_load"
[(set (reg:VNx16BI FFR_REGNUM)
(unspec:VNx16BI [(reg:VNx16BI FFRT_REGNUM)
(reg:VNx16BI FFR_REGNUM)] UNSPEC_UPDATE_FFR))]
"TARGET_SVE"
""
[(set_attr "type" "no_insn")]
)
;; [R1 in the block comment above about FFR handling]
;;
;; Notionally copy the FFR to the FFRT, so that the current FFR value
;; can be read from there by the RDFFR instructions below. This acts
;; as a scheduling barrier for earlier LDFF1 and LDNF1 instructions and
;; creates a natural dependency with earlier writes.
(define_insn "aarch64_copy_ffr_to_ffrt"
[(set (reg:VNx16BI FFRT_REGNUM)
(reg:VNx16BI FFR_REGNUM))]
"TARGET_SVE"
""
[(set_attr "type" "no_insn")]
)
;; [R2 in the block comment above about FFR handling]
;;
;; Read the FFR via the FFRT.
(define_insn "aarch64_rdffr"
[(set (match_operand:VNx16BI 0 "register_operand" "=Upa")
(reg:VNx16BI FFRT_REGNUM))]
"TARGET_SVE"
"rdffr\t%0.b"
)
;; Likewise with zero predication.
(define_insn "aarch64_rdffr_z"
[(set (match_operand:VNx16BI 0 "register_operand" "=Upa")
(and:VNx16BI
(reg:VNx16BI FFRT_REGNUM)
(match_operand:VNx16BI 1 "register_operand" "Upa")))]
"TARGET_SVE"
"rdffr\t%0.b, %1/z"
)
;; Read the FFR to test for a fault, without using the predicate result.
(define_insn "*aarch64_rdffr_z_ptest"
[(set (reg:CC_NZC CC_REGNUM)
(unspec:CC_NZC
[(match_operand:VNx16BI 1 "register_operand" "Upa")
(match_dup 1)
(match_operand:SI 2 "aarch64_sve_ptrue_flag")
(and:VNx16BI
(reg:VNx16BI FFRT_REGNUM)
(match_dup 1))]
UNSPEC_PTEST))
(clobber (match_scratch:VNx16BI 0 "=Upa"))]
"TARGET_SVE"
"rdffrs\t%0.b, %1/z"
)
;; Same for unpredicated RDFFR when tested with a known PTRUE.
(define_insn "*aarch64_rdffr_ptest"
[(set (reg:CC_NZC CC_REGNUM)
(unspec:CC_NZC
[(match_operand:VNx16BI 1 "register_operand" "Upa")
(match_dup 1)
(const_int SVE_KNOWN_PTRUE)
(reg:VNx16BI FFRT_REGNUM)]
UNSPEC_PTEST))
(clobber (match_scratch:VNx16BI 0 "=Upa"))]
"TARGET_SVE"
"rdffrs\t%0.b, %1/z"
)
;; Read the FFR with zero predication and test the result.
(define_insn "*aarch64_rdffr_z_cc"
[(set (reg:CC_NZC CC_REGNUM)
(unspec:CC_NZC
[(match_operand:VNx16BI 1 "register_operand" "Upa")
(match_dup 1)
(match_operand:SI 2 "aarch64_sve_ptrue_flag")
(and:VNx16BI
(reg:VNx16BI FFRT_REGNUM)
(match_dup 1))]
UNSPEC_PTEST))
(set (match_operand:VNx16BI 0 "register_operand" "=Upa")
(and:VNx16BI
(reg:VNx16BI FFRT_REGNUM)
(match_dup 1)))]
"TARGET_SVE"
"rdffrs\t%0.b, %1/z"
)
;; Same for unpredicated RDFFR when tested with a known PTRUE.
(define_insn "*aarch64_rdffr_cc"
[(set (reg:CC_NZC CC_REGNUM)
(unspec:CC_NZC
[(match_operand:VNx16BI 1 "register_operand" "Upa")
(match_dup 1)
(const_int SVE_KNOWN_PTRUE)
(reg:VNx16BI FFRT_REGNUM)]
UNSPEC_PTEST))
(set (match_operand:VNx16BI 0 "register_operand" "=Upa")
(reg:VNx16BI FFRT_REGNUM))]
"TARGET_SVE"
"rdffrs\t%0.b, %1/z"
)
;; [R3 in the block comment above about FFR handling]
;;
;; Arbitrarily update the FFRT after a read from the FFR. This acts as
;; a scheduling barrier for later LDFF1 and LDNF1 instructions.
(define_insn "aarch64_update_ffrt"
[(set (reg:VNx16BI FFRT_REGNUM)
(unspec:VNx16BI [(reg:VNx16BI FFRT_REGNUM)] UNSPEC_UPDATE_FFRT))]
"TARGET_SVE"
""
[(set_attr "type" "no_insn")]
)
;; =========================================================================
;; == Loads
;; =========================================================================
;; -------------------------------------------------------------------------
;; ---- Normal contiguous loads
;; -------------------------------------------------------------------------
;; Includes contiguous forms of:
;; - LD1B
;; - LD1D
;; - LD1H
;; - LD1W
;; - LD2B
;; - LD2D
;; - LD2H
;; - LD2W
;; - LD3B
;; - LD3D
;; - LD3H
;; - LD3W
;; - LD4B
;; - LD4D
;; - LD4H
;; - LD4W
;; -------------------------------------------------------------------------
;; Predicated LD1.
(define_insn "maskload"
[(set (match_operand:SVE_ALL 0 "register_operand" "=w")
(unspec:SVE_ALL
[(match_operand: 2 "register_operand" "Upl")
(match_operand:SVE_ALL 1 "memory_operand" "m")]
UNSPEC_LD1_SVE))]
"TARGET_SVE"
"ld1\t%0., %2/z, %1"
)
;; Unpredicated LD[234].
(define_expand "vec_load_lanes"
[(set (match_operand:SVE_STRUCT 0 "register_operand")
(unspec:SVE_STRUCT
[(match_dup 2)
(match_operand:SVE_STRUCT 1 "memory_operand")]
UNSPEC_LDN))]
"TARGET_SVE"
{
operands[2] = aarch64_ptrue_reg (mode);
}
)
;; Predicated LD[234].
(define_insn "vec_mask_load_lanes"
[(set (match_operand:SVE_STRUCT 0 "register_operand" "=w")
(unspec:SVE_STRUCT
[(match_operand: 2 "register_operand" "Upl")
(match_operand:SVE_STRUCT 1 "memory_operand" "m")]
UNSPEC_LDN))]
"TARGET_SVE"
"ld\t%0, %2/z, %1"
)
;; -------------------------------------------------------------------------
;; ---- Extending contiguous loads
;; -------------------------------------------------------------------------
;; Includes contiguous forms of:
;; - LD1B
;; - LD1H
;; - LD1SB
;; - LD1SH
;; - LD1SW
;; - LD1W
;; -------------------------------------------------------------------------
;; Predicated load and extend, with 8 elements per 128-bit block.
(define_insn_and_rewrite "@aarch64_load_"
[(set (match_operand:SVE_HSDI 0 "register_operand" "=w")
(unspec:SVE_HSDI
[(match_operand: 3 "general_operand" "UplDnm")
(ANY_EXTEND:SVE_HSDI
(unspec:SVE_PARTIAL_I
[(match_operand: 2 "register_operand" "Upl")
(match_operand:SVE_PARTIAL_I 1 "memory_operand" "m")]
UNSPEC_LD1_SVE))]
UNSPEC_PRED_X))]
"TARGET_SVE && (~ & ) == 0"
"ld1\t%0., %2/z, %1"
"&& !CONSTANT_P (operands[3])"
{
operands[3] = CONSTM1_RTX (mode);
}
)
;; -------------------------------------------------------------------------
;; ---- First-faulting contiguous loads
;; -------------------------------------------------------------------------
;; Includes contiguous forms of:
;; - LDFF1B
;; - LDFF1D
;; - LDFF1H
;; - LDFF1W
;; - LDNF1B
;; - LDNF1D
;; - LDNF1H
;; - LDNF1W
;; -------------------------------------------------------------------------
;; Contiguous non-extending first-faulting or non-faulting loads.
(define_insn "@aarch64_ldf1"
[(set (match_operand:SVE_FULL 0 "register_operand" "=w")
(unspec:SVE_FULL
[(match_operand: 2 "register_operand" "Upl")
(match_operand:SVE_FULL 1 "aarch64_sve_ldf1_operand" "Ut")
(reg:VNx16BI FFRT_REGNUM)]
SVE_LDFF1_LDNF1))]
"TARGET_SVE"
"ldf1\t%0., %2/z, %1"
)
;; -------------------------------------------------------------------------
;; ---- First-faulting extending contiguous loads
;; -------------------------------------------------------------------------
;; Includes contiguous forms of:
;; - LDFF1B
;; - LDFF1H
;; - LDFF1SB
;; - LDFF1SH
;; - LDFF1SW
;; - LDFF1W
;; - LDNF1B
;; - LDNF1H
;; - LDNF1SB
;; - LDNF1SH
;; - LDNF1SW
;; - LDNF1W
;; -------------------------------------------------------------------------
;; Predicated first-faulting or non-faulting load and extend.
(define_insn_and_rewrite "@aarch64_ldf1_"
[(set (match_operand:SVE_HSDI 0 "register_operand" "=w")
(unspec:SVE_HSDI
[(match_operand: 3 "general_operand" "UplDnm")
(ANY_EXTEND:SVE_HSDI
(unspec:SVE_PARTIAL_I
[(match_operand: 2 "register_operand" "Upl")
(match_operand:SVE_PARTIAL_I 1 "aarch64_sve_ldf1_operand" "Ut")
(reg:VNx16BI FFRT_REGNUM)]
SVE_LDFF1_LDNF1))]
UNSPEC_PRED_X))]
"TARGET_SVE && (~ & ) == 0"
"ldf1\t%0., %2/z, %1"
"&& !CONSTANT_P (operands[3])"
{
operands[3] = CONSTM1_RTX (mode);
}
)
;; -------------------------------------------------------------------------
;; ---- Non-temporal contiguous loads
;; -------------------------------------------------------------------------
;; Includes:
;; - LDNT1B
;; - LDNT1D
;; - LDNT1H
;; - LDNT1W
;; -------------------------------------------------------------------------
;; Predicated contiguous non-temporal load.
(define_insn "@aarch64_ldnt1"
[(set (match_operand:SVE_FULL 0 "register_operand" "=w")
(unspec:SVE_FULL
[(match_operand: 2 "register_operand" "Upl")
(match_operand:SVE_FULL 1 "memory_operand" "m")]
UNSPEC_LDNT1_SVE))]
"TARGET_SVE"
"ldnt1\t%0., %2/z, %1"
)
;; -------------------------------------------------------------------------
;; ---- Normal gather loads
;; -------------------------------------------------------------------------
;; Includes gather forms of:
;; - LD1D
;; - LD1W
;; -------------------------------------------------------------------------
;; Unpredicated gather loads.
(define_expand "gather_load"
[(set (match_operand:SVE_24 0 "register_operand")
(unspec:SVE_24
[(match_dup 5)
(match_operand:DI 1 "aarch64_sve_gather_offset_")
(match_operand: 2 "register_operand")
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
"TARGET_SVE"
{
operands[5] = aarch64_ptrue_reg (mode);
}
)
;; Predicated gather loads for 32-bit elements. Operand 3 is true for
;; unsigned extension and false for signed extension.
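;; For example (as an ACLE-level illustration), svld1_gather_u32offset_f32
;; extends the offsets as unsigned values and so corresponds to operand 3
;; being true, while svld1_gather_s32offset_f32 corresponds to operand 3
;; being false.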
(define_insn "mask_gather_load"
[(set (match_operand:SVE_4 0 "register_operand" "=w, w, w, w, w, w")
(unspec:SVE_4
[(match_operand:VNx4BI 5 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl")
(match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vgw, rk, rk, rk, rk")
(match_operand:VNx4SI 2 "register_operand" "w, w, w, w, w, w")
(match_operand:DI 3 "const_int_operand" "Ui1, Ui1, Z, Ui1, Z, Ui1")
(match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, Ui1, i, i")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
"TARGET_SVE"
"@
ld1\t%0.s, %5/z, [%2.s]
ld1\t%0.s, %5/z, [%2.s, #%1]
ld1\t%0.s, %5/z, [%1, %2.s, sxtw]
ld1\t%0.s, %5/z, [%1, %2.s, uxtw]
ld1\t%0.s, %5/z, [%1, %2.s, sxtw %p4]
ld1\t%0.s, %5/z, [%1, %2.s, uxtw %p4]"
)
;; Predicated gather loads for 64-bit elements. The value of operand 3
;; doesn't matter in this case.
(define_insn "mask_gather_load"
[(set (match_operand:SVE_2 0 "register_operand" "=w, w, w, w")
(unspec:SVE_2
[(match_operand:VNx2BI 5 "register_operand" "Upl, Upl, Upl, Upl")
(match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vgd, rk, rk")
(match_operand:VNx2DI 2 "register_operand" "w, w, w, w")
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, i")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
"TARGET_SVE"
"@
ld1\t%0.d, %5/z, [%2.d]
ld1\t%0.d, %5/z, [%2.d, #%1]
ld1\t%0.d, %5/z, [%1, %2.d]
ld1\t%0.d, %5/z, [%1, %2.d, lsl %p4]"
)
;; Likewise, but with the offset being extended from 32 bits.
(define_insn_and_rewrite "*mask_gather_load_xtw_unpacked"
[(set (match_operand:SVE_2 0 "register_operand" "=w, w")
(unspec:SVE_2
[(match_operand:VNx2BI 5 "register_operand" "Upl, Upl")
(match_operand:DI 1 "register_operand" "rk, rk")
(unspec:VNx2DI
[(match_operand 6)
(ANY_EXTEND:VNx2DI
(match_operand:VNx2SI 2 "register_operand" "w, w"))]
UNSPEC_PRED_X)
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
"TARGET_SVE"
"@
ld1\t%0.d, %5/z, [%1, %2.d, xtw]
ld1\t%0.d, %5/z, [%1, %2.d, xtw %p4]"
"&& !CONSTANT_P (operands[6])"
{
operands[6] = CONSTM1_RTX (VNx2BImode);
}
)
;; Likewise, but with the offset being truncated to 32 bits and then
;; sign-extended.
(define_insn_and_rewrite "*mask_gather_load_sxtw"
[(set (match_operand:SVE_2 0 "register_operand" "=w, w")
(unspec:SVE_2
[(match_operand:VNx2BI 5 "register_operand" "Upl, Upl")
(match_operand:DI 1 "register_operand" "rk, rk")
(unspec:VNx2DI
[(match_operand 6)
(sign_extend:VNx2DI
(truncate:VNx2SI
(match_operand:VNx2DI 2 "register_operand" "w, w")))]
UNSPEC_PRED_X)
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
"TARGET_SVE"
"@
ld1\t%0.d, %5/z, [%1, %2.d, sxtw]
ld1\t%0.d, %5/z, [%1, %2.d, sxtw %p4]"
"&& !CONSTANT_P (operands[6])"
{
operands[6] = CONSTM1_RTX (VNx2BImode);
}
)
;; Likewise, but with the offset being truncated to 32 bits and then
;; zero-extended.
(define_insn "*mask_gather_load_uxtw"
[(set (match_operand:SVE_2 0 "register_operand" "=w, w")
(unspec:SVE_2
[(match_operand:VNx2BI 5 "register_operand" "Upl, Upl")
(match_operand:DI 1 "register_operand" "rk, rk")
(and:VNx2DI
(match_operand:VNx2DI 2 "register_operand" "w, w")
(match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate"))
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
"TARGET_SVE"
"@
ld1\t%0.d, %5/z, [%1, %2.d, uxtw]
ld1\t%0.d, %5/z, [%1, %2.d, uxtw %p4]"
)
;; -------------------------------------------------------------------------
;; ---- Extending gather loads
;; -------------------------------------------------------------------------
;; Includes gather forms of:
;; - LD1B
;; - LD1H
;; - LD1SB
;; - LD1SH
;; - LD1SW
;; - LD1W
;; -------------------------------------------------------------------------
;; Predicated extending gather loads for 32-bit elements. Operand 3 is
;; true for unsigned extension and false for signed extension.
(define_insn_and_rewrite "@aarch64_gather_load_"
[(set (match_operand:SVE_4HSI 0 "register_operand" "=w, w, w, w, w, w")
(unspec:SVE_4HSI
[(match_operand:VNx4BI 6 "general_operand" "UplDnm, UplDnm, UplDnm, UplDnm, UplDnm, UplDnm")
(ANY_EXTEND:SVE_4HSI
(unspec:SVE_4BHI
[(match_operand:VNx4BI 5 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl")
(match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk, rk, rk")
(match_operand:VNx4SI 2 "register_operand" "w, w, w, w, w, w")
(match_operand:DI 3 "const_int_operand" "Ui1, Ui1, Z, Ui1, Z, Ui1")
(match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, Ui1, i, i")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
UNSPEC_PRED_X))]
"TARGET_SVE && (~ & ) == 0"
"@
ld1\t%0.s, %5/z, [%2.s]
ld1\t%0.s, %5/z, [%2.s, #%1]
ld1\t%0.s, %5/z, [%1, %2.s, sxtw]
ld1\t%0.s, %5/z, [%1, %2.s, uxtw]
ld1\t%0.s, %5/z, [%1, %2.s, sxtw %p4]
ld1\t%0.s, %5/z, [%1, %2.s, uxtw %p4]"
"&& !CONSTANT_P (operands[6])"
{
operands[6] = CONSTM1_RTX (VNx4BImode);
}
)
;; Predicated extending gather loads for 64-bit elements. The value of
;; operand 3 doesn't matter in this case.
(define_insn_and_rewrite "@aarch64_gather_load_"
[(set (match_operand:SVE_2HSDI 0 "register_operand" "=w, w, w, w")
(unspec:SVE_2HSDI
[(match_operand:VNx2BI 6 "general_operand" "UplDnm, UplDnm, UplDnm, UplDnm")
(ANY_EXTEND:SVE_2HSDI
(unspec:SVE_2BHSI
[(match_operand:VNx2BI 5 "register_operand" "Upl, Upl, Upl, Upl")
(match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk")
(match_operand:VNx2DI 2 "register_operand" "w, w, w, w")
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, i")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
UNSPEC_PRED_X))]
"TARGET_SVE && (~ & ) == 0"
"@
ld1\t%0.d, %5/z, [%2.d]
ld1\t%0.d, %5/z, [%2.d, #%1]
ld1\t%0.d, %5/z, [%1, %2.d]
ld1\t%0.d, %5/z, [%1, %2.d, lsl %p4]"
"&& !CONSTANT_P (operands[6])"
{
operands[6] = CONSTM1_RTX (VNx2BImode);
}
)
;; Likewise, but with the offset being extended from 32 bits.
(define_insn_and_rewrite "*aarch64_gather_load__xtw_unpacked"
[(set (match_operand:SVE_2HSDI 0 "register_operand" "=w, w")
(unspec:SVE_2HSDI
[(match_operand 6)
(ANY_EXTEND:SVE_2HSDI
(unspec:SVE_2BHSI
[(match_operand:VNx2BI 5 "register_operand" "Upl, Upl")
(match_operand:DI 1 "aarch64_reg_or_zero" "rk, rk")
(unspec:VNx2DI
[(match_operand 7)
(ANY_EXTEND2:VNx2DI
(match_operand:VNx2SI 2 "register_operand" "w, w"))]
UNSPEC_PRED_X)
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
UNSPEC_PRED_X))]
"TARGET_SVE && (~ & ) == 0"
"@
ld1\t%0.d, %5/z, [%1, %2.d, xtw]
ld1\t%0.d, %5/z, [%1, %2.d, xtw %p4]"
"&& (!CONSTANT_P (operands[6]) || !CONSTANT_P (operands[7]))"
{
operands[6] = CONSTM1_RTX (VNx2BImode);
operands[7] = CONSTM1_RTX (VNx2BImode);
}
)
;; Likewise, but with the offset being truncated to 32 bits and then
;; sign-extended.
(define_insn_and_rewrite "*aarch64_gather_load__sxtw"
[(set (match_operand:SVE_2HSDI 0 "register_operand" "=w, w")
(unspec:SVE_2HSDI
[(match_operand 6)
(ANY_EXTEND:SVE_2HSDI
(unspec:SVE_2BHSI
[(match_operand:VNx2BI 5 "register_operand" "Upl, Upl")
(match_operand:DI 1 "aarch64_reg_or_zero" "rk, rk")
(unspec:VNx2DI
[(match_operand 7)
(sign_extend:VNx2DI
(truncate:VNx2SI
(match_operand:VNx2DI 2 "register_operand" "w, w")))]
UNSPEC_PRED_X)
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
UNSPEC_PRED_X))]
"TARGET_SVE && (~ & ) == 0"
"@
ld1\t%0.d, %5/z, [%1, %2.d, sxtw]
ld1\t%0.d, %5/z, [%1, %2.d, sxtw %p4]"
"&& (!CONSTANT_P (operands[6]) || !CONSTANT_P (operands[7]))"
{
operands[6] = CONSTM1_RTX (VNx2BImode);
operands[7] = CONSTM1_RTX (VNx2BImode);
}
)
;; Likewise, but with the offset being truncated to 32 bits and then
;; zero-extended.
(define_insn_and_rewrite "*aarch64_gather_load__uxtw"
[(set (match_operand:SVE_2HSDI 0 "register_operand" "=w, w")
(unspec:SVE_2HSDI
[(match_operand 7)
(ANY_EXTEND:SVE_2HSDI
(unspec:SVE_2BHSI
[(match_operand:VNx2BI 5 "register_operand" "Upl, Upl")
(match_operand:DI 1 "aarch64_reg_or_zero" "rk, rk")
(and:VNx2DI
(match_operand:VNx2DI 2 "register_operand" "w, w")
(match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate"))
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, i")
(mem:BLK (scratch))]
UNSPEC_LD1_GATHER))]
UNSPEC_PRED_X))]
"TARGET_SVE && (~ & ) == 0"
"@
ld1\t%0.d, %5/z, [%1, %2.d, uxtw]
ld1\t%0.d, %5/z, [%1, %2.d, uxtw %p4]"
"&& !CONSTANT_P (operands[7])"
{
operands[7] = CONSTM1_RTX (VNx2BImode);
}
)
;; -------------------------------------------------------------------------
;; ---- First-faulting gather loads
;; -------------------------------------------------------------------------
;; Includes gather forms of:
;; - LDFF1D
;; - LDFF1W
;; -------------------------------------------------------------------------
;; Predicated first-faulting gather loads for 32-bit elements. Operand
;; 3 is true for unsigned extension and false for signed extension.
(define_insn "@aarch64_ldff1_gather"
[(set (match_operand:SVE_FULL_S 0 "register_operand" "=w, w, w, w, w, w")
(unspec:SVE_FULL_S
[(match_operand:VNx4BI 5 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl")
(match_operand:DI 1 "aarch64_sve_gather_offset_w" "Z, vgw, rk, rk, rk, rk")
(match_operand:VNx4SI 2 "register_operand" "w, w, w, w, w, w")
(match_operand:DI 3 "const_int_operand" "i, i, Z, Ui1, Z, Ui1")
(match_operand:DI 4 "aarch64_gather_scale_operand_w" "Ui1, Ui1, Ui1, Ui1, i, i")
(mem:BLK (scratch))
(reg:VNx16BI FFRT_REGNUM)]
UNSPEC_LDFF1_GATHER))]
"TARGET_SVE"
"@
ldff1w\t%0.s, %5/z, [%2.s]
ldff1w\t%0.s, %5/z, [%2.s, #%1]
ldff1w\t%0.s, %5/z, [%1, %2.s, sxtw]
ldff1w\t%0.s, %5/z, [%1, %2.s, uxtw]
ldff1w\t%0.s, %5/z, [%1, %2.s, sxtw %p4]
ldff1w\t%0.s, %5/z, [%1, %2.s, uxtw %p4]"
)
;; Predicated first-faulting gather loads for 64-bit elements. The value
;; of operand 3 doesn't matter in this case.
(define_insn "@aarch64_ldff1_gather"
[(set (match_operand:SVE_FULL_D 0 "register_operand" "=w, w, w, w")
(unspec:SVE_FULL_D
[(match_operand:VNx2BI 5 "register_operand" "Upl, Upl, Upl, Upl")
(match_operand:DI 1 "aarch64_sve_gather_offset_d" "Z, vgd, rk, rk")
(match_operand:VNx2DI 2 "register_operand" "w, w, w, w")
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, Ui1, Ui1, i")
(mem:BLK (scratch))
(reg:VNx16BI FFRT_REGNUM)]
UNSPEC_LDFF1_GATHER))]
"TARGET_SVE"
"@
ldff1d\t%0.d, %5/z, [%2.d]
ldff1d\t%0.d, %5/z, [%2.d, #%1]
ldff1d\t%0.d, %5/z, [%1, %2.d]
ldff1d\t%0.d, %5/z, [%1, %2.d, lsl %p4]"
)
;; Likewise, but with the offset being sign-extended from 32 bits.
(define_insn_and_rewrite "*aarch64_ldff1_gather_sxtw"
[(set (match_operand:SVE_FULL_D 0 "register_operand" "=w, w")
(unspec:SVE_FULL_D
[(match_operand:VNx2BI 5 "register_operand" "Upl, Upl")
(match_operand:DI 1 "register_operand" "rk, rk")
(unspec:VNx2DI
[(match_operand 6)
(sign_extend:VNx2DI
(truncate:VNx2SI
(match_operand:VNx2DI 2 "register_operand" "w, w")))]
UNSPEC_PRED_X)
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, i")
(mem:BLK (scratch))
(reg:VNx16BI FFRT_REGNUM)]
UNSPEC_LDFF1_GATHER))]
"TARGET_SVE"
"@
ldff1d\t%0.d, %5/z, [%1, %2.d, sxtw]
ldff1d\t%0.d, %5/z, [%1, %2.d, sxtw %p4]"
"&& !CONSTANT_P (operands[6])"
{
operands[6] = CONSTM1_RTX (VNx2BImode);
}
)
;; Likewise, but with the offset being zero-extended from 32 bits.
(define_insn "*aarch64_ldff1_gather_uxtw"
[(set (match_operand:SVE_FULL_D 0 "register_operand" "=w, w")
(unspec:SVE_FULL_D
[(match_operand:VNx2BI 5 "register_operand" "Upl, Upl")
(match_operand:DI 1 "register_operand" "rk, rk")
(and:VNx2DI
(match_operand:VNx2DI 2 "register_operand" "w, w")
(match_operand:VNx2DI 6 "aarch64_sve_uxtw_immediate"))
(match_operand:DI 3 "const_int_operand")
(match_operand:DI 4 "aarch64_gather_scale_operand_d" "Ui1, i")
(mem:BLK (scratch))
(reg:VNx16BI FFRT_REGNUM)]
UNSPEC_LDFF1_GATHER))]
"TARGET_SVE"
"@
ldff1d\t%0.d, %5/z, [%1, %2.d, uxtw]
ldff1d\t%0.d, %5/z, [%1, %2.d, uxtw %p4]"
)
;; -------------------------------------------------------------------------
;; ---- First-faulting extending gather loads
;; -------------------------------------------------------------------------
;; Includes gather forms of:
;; - LDFF1B
;; - LDFF1H
;; - LDFF1SB
;; - LDFF1SH
;; - LDFF1SW
;; - LDFF1W
;; -------------------------------------------------------------------------
;; Predicated extending first-faulting gather loads for 32-bit elements.
;; Operand 3 is true for unsigned extension and false for signed extension.
(define_insn_and_rewrite "@aarch64_ldff1_gather_"
[(set (match_operand:VNx4_WIDE 0 "register_operand" "=w, w, w, w, w, w")
(unspec:VNx4_WIDE
[(match_operand:VNx4BI 6 "general_operand" "UplDnm, UplDnm, UplDnm, UplDnm, UplDnm, UplDnm")
(ANY_EXTEND:VNx4_WIDE
(unspec:VNx4_NARROW
[(match_operand:VNx4BI 5 "register_operand" "Upl, Upl, Upl, Upl, Upl, Upl")
(match_operand:DI 1 "aarch64_sve_gather_offset_" "Z, vg, rk, rk, rk, rk")
(match_operand:VNx4_WIDE 2 "register_operand" "w, w, w, w, w, w")
(match_operand:DI 3 "const_int_operand" "i, i, Z, Ui1, Z, Ui1")
(match_operand:DI 4 "aarch64_gather_scale_operand_" "Ui1, Ui1, Ui1, Ui1, i, i")
(mem:BLK (scratch))
(reg:VNx16BI FFRT_REGNUM)]
UNSPEC_LDFF1_GATHER))]
UNSPEC_PRED_X))]
"TARGET_SVE"
"@
ldff1\t%0.s, %5/z, [%2.s]
ldff1\t%0.s, %5/z, [%2.s, #%1]
ldff1\t%0.s, %5/z, [%1, %2.s, sxtw]
ldff1