Skip to content

Commit a304d0f

Browse files
author
duke
committed
Backport b6732d6
1 parent ec6beaa commit a304d0f

File tree

8 files changed

+619
-56
lines changed

8 files changed

+619
-56
lines changed

src/hotspot/cpu/aarch64/aarch64_vector.ad

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -346,8 +346,14 @@ source %{
346346
}
347347

348348
bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
349-
// Only SVE has partial vector operations
350-
if (UseSVE == 0) {
349+
// 1. Only SVE requires partial vector operations.
350+
// 2. The vector size in bytes must be smaller than MaxVectorSize.
351+
// 3. Predicated vectors have a mask input, which guarantees that
352+
// out-of-bounds lanes remain inactive.
353+
int length_in_bytes = vt->length_in_bytes();
354+
if (UseSVE == 0 ||
355+
length_in_bytes == MaxVectorSize ||
356+
node->is_predicated_vector()) {
351357
return false;
352358
}
353359

@@ -370,21 +376,22 @@ source %{
370376
return !node->in(1)->is_Con();
371377
case Op_LoadVector:
372378
case Op_StoreVector:
373-
// We use NEON load/store instructions if the vector length is <= 128 bits.
374-
return vt->length_in_bytes() > 16;
375379
case Op_AddReductionVI:
376380
case Op_AddReductionVL:
377-
// We may prefer using NEON instructions rather than SVE partial operations.
378-
return !VM_Version::use_neon_for_vector(vt->length_in_bytes());
381+
// For these ops, we prefer using NEON instructions rather than SVE
382+
// predicated instructions for better performance.
383+
return !VM_Version::use_neon_for_vector(length_in_bytes);
379384
case Op_MinReductionV:
380385
case Op_MaxReductionV:
381-
// For BYTE/SHORT/INT/FLOAT/DOUBLE types, we may prefer using NEON
382-
// instructions rather than SVE partial operations.
386+
// For BYTE/SHORT/INT/FLOAT/DOUBLE types, we prefer using NEON
387+
// instructions rather than SVE predicated instructions for
388+
// better performance.
383389
return vt->element_basic_type() == T_LONG ||
384-
!VM_Version::use_neon_for_vector(vt->length_in_bytes());
390+
!VM_Version::use_neon_for_vector(length_in_bytes);
385391
default:
386-
// For other ops whose vector size is smaller than the max vector size, a
387-
// full-sized unpredicated operation does not impact the final vector result.
392+
// For other ops whose vector size is smaller than the max vector
393+
// size, a full-sized unpredicated operation does not impact the
394+
// vector result.
388395
return false;
389396
}
390397
}

src/hotspot/cpu/aarch64/aarch64_vector_ad.m4

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -336,8 +336,14 @@ source %{
336336
}
337337

338338
bool Matcher::vector_needs_partial_operations(Node* node, const TypeVect* vt) {
339-
// Only SVE has partial vector operations
340-
if (UseSVE == 0) {
339+
// 1. Only SVE requires partial vector operations.
340+
// 2. The vector size in bytes must be smaller than MaxVectorSize.
341+
// 3. Predicated vectors have a mask input, which guarantees that
342+
// out-of-bounds lanes remain inactive.
343+
int length_in_bytes = vt->length_in_bytes();
344+
if (UseSVE == 0 ||
345+
length_in_bytes == MaxVectorSize ||
346+
node->is_predicated_vector()) {
341347
return false;
342348
}
343349

@@ -360,21 +366,22 @@ source %{
360366
return !node->in(1)->is_Con();
361367
case Op_LoadVector:
362368
case Op_StoreVector:
363-
// We use NEON load/store instructions if the vector length is <= 128 bits.
364-
return vt->length_in_bytes() > 16;
365369
case Op_AddReductionVI:
366370
case Op_AddReductionVL:
367-
// We may prefer using NEON instructions rather than SVE partial operations.
368-
return !VM_Version::use_neon_for_vector(vt->length_in_bytes());
371+
// For these ops, we prefer using NEON instructions rather than SVE
372+
// predicated instructions for better performance.
373+
return !VM_Version::use_neon_for_vector(length_in_bytes);
369374
case Op_MinReductionV:
370375
case Op_MaxReductionV:
371-
// For BYTE/SHORT/INT/FLOAT/DOUBLE types, we may prefer using NEON
372-
// instructions rather than SVE partial operations.
376+
// For BYTE/SHORT/INT/FLOAT/DOUBLE types, we prefer using NEON
377+
// instructions rather than SVE predicated instructions for
378+
// better performance.
373379
return vt->element_basic_type() == T_LONG ||
374-
!VM_Version::use_neon_for_vector(vt->length_in_bytes());
380+
!VM_Version::use_neon_for_vector(length_in_bytes);
375381
default:
376-
// For other ops whose vector size is smaller than the max vector size, a
377-
// full-sized unpredicated operation does not impact the final vector result.
382+
// For other ops whose vector size is smaller than the max vector
383+
// size, a full-sized unpredicated operation does not impact the
384+
// vector result.
378385
return false;
379386
}
380387
}

src/hotspot/share/opto/matcher.hpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -329,6 +329,10 @@ class Matcher : public PhaseTransform {
329329

330330
static bool match_rule_supported_vector_masked(int opcode, int vlen, BasicType bt);
331331

332+
// Determines if a vector operation needs to be partially implemented with a mask
333+
// ensuring that only the lanes in range [0, vector_length) are processed. This applies
334+
// to operations whose vector length is less than the hardware-supported maximum
335+
// vector length. Returns true if the operation requires masking, false otherwise.
332336
static bool vector_needs_partial_operations(Node* node, const TypeVect* vt);
333337

334338
static bool vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen);

src/hotspot/share/opto/vectornode.cpp

Lines changed: 32 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -936,28 +936,26 @@ bool VectorNode::is_scalar_op_that_returns_int_but_vector_op_returns_long(int op
936936
}
937937
}
938938

939+
// Idealize vector operations whose vector size is less than the hardware supported
940+
// max vector size. Generate a vector mask for the operation. Lanes with indices
941+
// inside of the vector size are set to true, while the remaining lanes are set to
942+
// false. Returns the corresponding masked vector node.
943+
static Node* ideal_partial_operations(PhaseGVN* phase, Node* node, const TypeVect* vt) {
944+
if (!Matcher::vector_needs_partial_operations(node, vt)) {
945+
return nullptr;
946+
}
939947

940-
Node* VectorNode::try_to_gen_masked_vector(PhaseGVN* gvn, Node* node, const TypeVect* vt) {
941948
int vopc = node->Opcode();
942949
uint vlen = vt->length();
943950
BasicType bt = vt->element_basic_type();
951+
assert(Matcher::match_rule_supported_vector_masked(vopc, vlen, bt),
952+
"The masked feature is required for the vector operation");
953+
assert(Matcher::match_rule_supported_vector(Op_VectorMaskGen, vlen, bt),
954+
"'VectorMaskGen' is required to generate a vector mask");
944955

945-
// Predicated vectors do not need to add another mask input
946-
if (node->is_predicated_vector() || !Matcher::has_predicated_vectors() ||
947-
!Matcher::match_rule_supported_vector_masked(vopc, vlen, bt) ||
948-
!Matcher::match_rule_supported_vector(Op_VectorMaskGen, vlen, bt)) {
949-
return nullptr;
950-
}
951-
952-
Node* mask = nullptr;
953-
// Generate a vector mask for vector operation whose vector length is lower than the
954-
// hardware supported max vector length.
955-
if (vt->length_in_bytes() < (uint)MaxVectorSize) {
956-
Node* length = gvn->transform(new ConvI2LNode(gvn->makecon(TypeInt::make(vlen))));
957-
mask = gvn->transform(VectorMaskGenNode::make(length, bt, vlen));
958-
} else {
959-
return nullptr;
960-
}
956+
// Generate a vector mask, with lanes inside of the vector length set to true.
957+
Node* length = phase->transform(new ConvI2LNode(phase->makecon(TypeInt::make(vlen))));
958+
Node* mask = phase->transform(VectorMaskGenNode::make(length, bt, vlen));
961959

962960
// Generate the related masked op for vector load/store/load_gather/store_scatter.
963961
// Or append the mask to the vector op's input list by default.
@@ -1037,8 +1035,9 @@ bool VectorNode::should_swap_inputs_to_help_global_value_numbering() {
10371035
}
10381036

10391037
Node* VectorNode::Ideal(PhaseGVN* phase, bool can_reshape) {
1040-
if (Matcher::vector_needs_partial_operations(this, vect_type())) {
1041-
return try_to_gen_masked_vector(phase, this, vect_type());
1038+
Node* n = ideal_partial_operations(phase, this, vect_type());
1039+
if (n != nullptr) {
1040+
return n;
10421041
}
10431042

10441043
// Sort inputs of commutative non-predicated vector operations to help value numbering.
@@ -1119,9 +1118,9 @@ LoadVectorNode* LoadVectorNode::make(int opc, Node* ctl, Node* mem,
11191118
}
11201119

11211120
Node* LoadVectorNode::Ideal(PhaseGVN* phase, bool can_reshape) {
1122-
const TypeVect* vt = vect_type();
1123-
if (Matcher::vector_needs_partial_operations(this, vt)) {
1124-
return VectorNode::try_to_gen_masked_vector(phase, this, vt);
1121+
Node* n = ideal_partial_operations(phase, this, vect_type());
1122+
if (n != nullptr) {
1123+
return n;
11251124
}
11261125
return LoadNode::Ideal(phase, can_reshape);
11271126
}
@@ -1133,9 +1132,9 @@ StoreVectorNode* StoreVectorNode::make(int opc, Node* ctl, Node* mem, Node* adr,
11331132
}
11341133

11351134
Node* StoreVectorNode::Ideal(PhaseGVN* phase, bool can_reshape) {
1136-
const TypeVect* vt = vect_type();
1137-
if (Matcher::vector_needs_partial_operations(this, vt)) {
1138-
return VectorNode::try_to_gen_masked_vector(phase, this, vt);
1135+
Node* n = ideal_partial_operations(phase, this, vect_type());
1136+
if (n != nullptr) {
1137+
return n;
11391138
}
11401139
return StoreNode::Ideal(phase, can_reshape);
11411140
}
@@ -1411,11 +1410,11 @@ ReductionNode* ReductionNode::make(int opc, Node* ctrl, Node* n1, Node* n2, Basi
14111410
}
14121411

14131412
Node* ReductionNode::Ideal(PhaseGVN* phase, bool can_reshape) {
1414-
const TypeVect* vt = vect_type();
1415-
if (Matcher::vector_needs_partial_operations(this, vt)) {
1416-
return VectorNode::try_to_gen_masked_vector(phase, this, vt);
1413+
Node* n = ideal_partial_operations(phase, this, vect_type());
1414+
if (n != nullptr) {
1415+
return n;
14171416
}
1418-
return nullptr;
1417+
return Node::Ideal(phase, can_reshape);
14191418
}
14201419

14211420
// Convert fromLong to maskAll if the input sets or unsets all lanes.
@@ -1893,11 +1892,11 @@ Node* VectorMaskOpNode::make(Node* mask, const Type* ty, int mopc) {
18931892
}
18941893

18951894
Node* VectorMaskOpNode::Ideal(PhaseGVN* phase, bool can_reshape) {
1896-
const TypeVect* vt = vect_type();
1897-
if (Matcher::vector_needs_partial_operations(this, vt)) {
1898-
return VectorNode::try_to_gen_masked_vector(phase, this, vt);
1895+
Node* n = ideal_partial_operations(phase, this, vect_type());
1896+
if (n != nullptr) {
1897+
return n;
18991898
}
1900-
return nullptr;
1899+
return TypeNode::Ideal(phase, can_reshape);
19011900
}
19021901

19031902
Node* VectorMaskCastNode::Identity(PhaseGVN* phase) {

src/hotspot/share/opto/vectornode.hpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,7 +117,6 @@ class VectorNode : public TypeNode {
117117
static bool is_vector_bitwise_not_pattern(Node* n);
118118
static Node* degenerate_vector_rotate(Node* n1, Node* n2, bool is_rotate_left, int vlen,
119119
BasicType bt, PhaseGVN* phase);
120-
static Node* try_to_gen_masked_vector(PhaseGVN* gvn, Node* node, const TypeVect* vt);
121120

122121
// [Start, end) half-open range defining which operands are vectors
123122
static void vector_operands(Node* n, uint* start, uint* end);

test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1448,6 +1448,16 @@ public class IRNode {
14481448
beforeMatchingNameRegex(VECTOR_MASK_LANE_IS_SET, "ExtractUB");
14491449
}
14501450

1451+
public static final String VECTOR_MASK_GEN = PREFIX + "VECTOR_MASK_GEN" + POSTFIX;
1452+
static {
1453+
beforeMatchingNameRegex(VECTOR_MASK_GEN, "VectorMaskGen");
1454+
}
1455+
1456+
public static final String VECTOR_MASK_FIRST_TRUE = PREFIX + "VECTOR_MASK_FIRST_TRUE" + POSTFIX;
1457+
static {
1458+
beforeMatchingNameRegex(VECTOR_MASK_FIRST_TRUE, "VectorMaskFirstTrue");
1459+
}
1460+
14511461
// Can only be used if avx512_vnni is available.
14521462
public static final String MUL_ADD_VS2VI_VNNI = PREFIX + "MUL_ADD_VS2VI_VNNI" + POSTFIX;
14531463
static {
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
/*
2+
* Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* This code is free software; you can redistribute it and/or modify it
6+
* under the terms of the GNU General Public License version 2 only, as
7+
* published by the Free Software Foundation.
8+
*
9+
* This code is distributed in the hope that it will be useful, but WITHOUT
10+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12+
* version 2 for more details (a copy is included in the LICENSE file that
13+
* accompanied this code).
14+
*
15+
* You should have received a copy of the GNU General Public License version
16+
* 2 along with this work; if not, write to the Free Software Foundation,
17+
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18+
*
19+
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20+
* or visit www.oracle.com if you need additional information or have any
21+
* questions.
22+
*/
23+
24+
package compiler.vectorapi;
25+
26+
import compiler.lib.generators.*;
27+
import compiler.lib.ir_framework.*;
28+
import jdk.incubator.vector.*;
29+
import jdk.test.lib.Asserts;
30+
31+
/**
32+
* @test 8371603
33+
* @key randomness
34+
* @library /test/lib /
35+
* @summary Test the missing optimization issues for vector load/store caused by JDK-8286941
36+
* @modules jdk.incubator.vector
37+
*
38+
* @run driver ${test.main.class}
39+
*/
40+
public class TestVectorLoadStoreOptimization {
41+
private static final int LENGTH = 1024;
42+
private static final Generators random = Generators.G;
43+
44+
private static final VectorSpecies<Integer> SPECIES = IntVector.SPECIES_PREFERRED;
45+
46+
private static int[] a;
47+
48+
static {
49+
a = new int[LENGTH];
50+
random.fill(random.ints(), a);
51+
}
52+
53+
// Test that "LoadVectorNode::Ideal()" calls "LoadNode::Ideal()" as expected,
54+
// which sees the previous stores that go to the same position independently,
55+
// and optimizes out the load with matched store values.
56+
@Test
57+
@IR(counts = { IRNode.LOAD_VECTOR_I, "1" },
58+
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true", "rvv", "true"})
59+
public static void testLoadVector() {
60+
IntVector v1 = IntVector.fromArray(SPECIES, a, 0);
61+
v1.intoArray(a, SPECIES.length());
62+
v1.intoArray(a, 2 * SPECIES.length());
63+
// The second load vector is equal to the first one and should be optimized
64+
// out by "LoadNode::Ideal()".
65+
IntVector v2 = IntVector.fromArray(SPECIES, a, SPECIES.length());
66+
v2.intoArray(a, 3 * SPECIES.length());
67+
}
68+
69+
@Check(test = "testLoadVector")
70+
public static void testLoadVectorVerify() {
71+
for (int i = SPECIES.length(); i < 4 * SPECIES.length(); i += SPECIES.length()) {
72+
for (int j = 0; j < SPECIES.length(); j++) {
73+
Asserts.assertEquals(a[i + j], a[j]);
74+
}
75+
}
76+
}
77+
78+
// Test that "StoreVectorNode::Ideal()" calls "StoreNode::Ideal()" as expected,
79+
// which can get rid of previous stores that go to the same position.
80+
@Test
81+
@IR(counts = { IRNode.STORE_VECTOR, "1" },
82+
applyIfCPUFeatureOr = {"asimd", "true", "avx", "true", "rvv", "true"})
83+
public static void testStoreVector() {
84+
IntVector v1 = IntVector.fromArray(SPECIES, a, 0 * SPECIES.length());
85+
IntVector v2 = IntVector.fromArray(SPECIES, a, 1 * SPECIES.length());
86+
// Useless store to same position as below, which should be optimized out by
87+
// "StoreNode::Ideal()".
88+
v1.intoArray(a, 3 * SPECIES.length());
89+
v2.intoArray(a, 3 * SPECIES.length());
90+
}
91+
92+
@Check(test = "testStoreVector")
93+
public static void testStoreVectorVerify() {
94+
for (int i = 3 * SPECIES.length(); i < 4 * SPECIES.length(); i++) {
95+
Asserts.assertEquals(a[i], a[i - 2 * SPECIES.length()]);
96+
}
97+
}
98+
99+
public static void main(String[] args) {
100+
TestFramework testFramework = new TestFramework();
101+
testFramework.setDefaultWarmup(10000)
102+
.addFlags("--add-modules=jdk.incubator.vector")
103+
.start();
104+
}
105+
}

0 commit comments

Comments
 (0)