Skip to content

Commit d08f97d

Browse files
ggreenwaydcillera
authored andcommitted
conn pool: fix bugs leading to incorrect conns created (#39446)
Backport #39310 and #39349 If a connection starts draining while it has negative unused capacity (which happens if a SETTINGS frame reduces allowed concurrency to below the current number of requests), that connections unused capacity will be included in total pool capacity even though it is unusable because it is draining. This can result in not enough connections being established for current pending requests. This is most problematic for long-lived requests (such as streaming gRPC requests or long-poll requests) because a connection could be in the draining state for a long time. Maybe fixes: #39238 Fixed an issue that could lead to too many connections when using :ref:`AutoHttpConfig <envoy_v3_api_msg_extensions.upstreams.http.v3.HttpProtocolOptions.AutoHttpConfig>` if the established connection is ``http/2`` and Envoy predicted it would have lower concurrent capacity. --------- Signed-off-by: Greg Greenway <[email protected]> Signed-off-by: Dario Cillerai <[email protected]>
1 parent 8bd888d commit d08f97d

File tree

5 files changed

+129
-13
lines changed

5 files changed

+129
-13
lines changed

changelogs/current.yaml

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,19 @@ minor_behavior_changes:
88

99
bug_fixes:
1010
# *Changes expected to improve the state of the world and are unlikely to have negative effects*
11-
- area: eds
11+
- area: conn_pool
1212
change: |
13-
Fixed crash when creating an EDS cluster with invalid configuration.
14-
- area: url_template
13+
Fixed an issue that could lead to too many connections when using
14+
:ref:`AutoHttpConfig <envoy_v3_api_msg_extensions.upstreams.http.v3.HttpProtocolOptions.AutoHttpConfig>` if the
15+
established connection is ``http/2`` and Envoy predicted it would have lower concurrent capacity.
16+
- area: conn_pool
1517
change: |
16-
Included the asterisk ``*`` in the match pattern when using the * or ** operators in the URL template.
17-
This behavioral change can be temporarily reverted by setting runtime guard
18-
``envoy.reloadable_features.uri_template_match_on_asterisk`` to ``false``.
18+
Fixed an issue that could lead to insufficient connections for current pending requests. If a connection starts draining while it
19+
has negative unused capacity (which happens if an HTTP/2 ``SETTINGS`` frame reduces allowed concurrency to below the current number
20+
of requests), that connection's unused capacity will be included in total pool capacity even though it is unusable because it is
21+
draining. This can result in not enough connections being established for current pending requests. This is most problematic for
22+
long-lived requests (such as streaming gRPC requests or long-poll requests) because a connection could be in the draining state
23+
for a long time.
1924
2025
removed_config_or_runtime:
2126
# *Normally occurs at the end of the* :ref:`deprecation period <deprecated>`

source/common/conn_pool/conn_pool_base.cc

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,7 @@ void ConnPoolImplBase::onStreamClosed(Envoy::ConnectionPool::ActiveClient& clien
261261
// We don't update the capacity for HTTP/3 as the stream count should only
262262
// increase when a MAX_STREAMS frame is received.
263263
if (trackStreamCapacity()) {
264-
// If the effective client capacity was limited by concurrency, increase connecting capacity.
264+
// If the effective client capacity was limited by concurrency, increase connected capacity.
265265
bool limited_by_concurrency =
266266
client.remaining_streams_ > client.concurrent_stream_limit_ - client.numActiveStreams() - 1;
267267
// The capacity calculated by concurrency could be negative if a SETTINGS frame lowered the
@@ -878,12 +878,20 @@ void ActiveClient::onConnectionDurationTimeout() {
878878
}
879879

880880
void ActiveClient::drain() {
881-
if (currentUnusedCapacity() <= 0) {
882-
return;
883-
}
884-
parent_.decrConnectingAndConnectedStreamCapacity(currentUnusedCapacity(), *this);
881+
const int64_t unused = currentUnusedCapacity();
885882

883+
// Remove draining client's capacity from the pool.
884+
//
885+
// The code that adds capacity back to the pool in `onStreamClosed` will only add it back if
886+
// it sees the connection as currently limited by concurrent capacity, not total lifetime streams.
887+
// Setting this to zero ensures that the `limited_by_concurrency` check does not detect this
888+
// connection as limited for that reason, because it is now being marked as having zero remaining
889+
// lifetime requests.
886890
remaining_streams_ = 0;
891+
892+
if (unused > 0) {
893+
parent_.decrConnectingAndConnectedStreamCapacity(unused, *this);
894+
}
887895
}
888896

889897
} // namespace ConnectionPool

source/common/http/mixed_conn_pool.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,8 @@ void HttpConnPoolImplMixed::onConnected(Envoy::ConnectionPool::ActiveClient& cli
8686
// it to reflect any difference between the TCP stream limits and HTTP/2
8787
// stream limits.
8888
if (new_client->effectiveConcurrentStreamLimit() > old_effective_limit) {
89-
state_.incrConnectingAndConnectedStreamCapacity(new_client->effectiveConcurrentStreamLimit() -
90-
old_effective_limit);
89+
incrConnectingAndConnectedStreamCapacity(
90+
new_client->effectiveConcurrentStreamLimit() - old_effective_limit, client);
9191
}
9292
new_client->setState(ActiveClient::State::Connecting);
9393
LinkedList::moveIntoList(std::move(new_client), owningList(new_client->state()));

test/common/http/http2/conn_pool_test.cc

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1805,6 +1805,97 @@ TEST_F(Http2ConnPoolImplTest, TestUnusedCapacity) {
18051805
CHECK_STATE(0 /*active*/, 0 /*pending*/, 0 /*capacity*/);
18061806
}
18071807

1808+
TEST_F(Http2ConnPoolImplTest, TestNegativeUnusedCapacity) {
1809+
cluster_->http2_options_.mutable_max_concurrent_streams()->set_value(4);
1810+
1811+
expectClientsCreate(2);
1812+
ActiveTestRequest r1(*this, 0, false);
1813+
CHECK_STATE(0 /*active*/, 1 /*pending*/, 4 /*capacity*/);
1814+
expectClientConnect(0, r1);
1815+
CHECK_STATE(1 /*active*/, 0 /*pending*/, 3 /*capacity*/);
1816+
1817+
ActiveTestRequest r2(*this, 0, true);
1818+
ActiveTestRequest r3(*this, 0, true);
1819+
CHECK_STATE(3 /*active*/, 0 /*pending*/, 1 /*capacity*/);
1820+
1821+
// Settings frame results in negative unused capacity.
1822+
NiceMock<MockReceivedSettings> settings;
1823+
settings.max_concurrent_streams_ = 1;
1824+
test_clients_[0].codec_client_->onSettings(settings);
1825+
CHECK_STATE(3 /*active*/, 0 /*pending*/, -2 /*capacity*/);
1826+
1827+
completeRequest(r1);
1828+
CHECK_STATE(2 /*active*/, 0 /*pending*/, -1 /*capacity*/);
1829+
1830+
// With negative capacity, verify that a new request creates a new connection.
1831+
ActiveTestRequest r4(*this, 1, false);
1832+
CHECK_STATE(2 /*active*/, 1 /*pending*/, 3 /*capacity*/);
1833+
expectClientConnect(1, r4);
1834+
CHECK_STATE(3 /*active*/, 0 /*pending*/, 2 /*capacity*/);
1835+
1836+
completeRequest(r2);
1837+
CHECK_STATE(2 /*active*/, 0 /*pending*/, 3 /*capacity*/);
1838+
1839+
completeRequest(r3);
1840+
CHECK_STATE(1 /*active*/, 0 /*pending*/, 4 /*capacity*/);
1841+
1842+
completeRequest(r4);
1843+
CHECK_STATE(0 /*active*/, 0 /*pending*/, 5 /*capacity*/);
1844+
1845+
// Clean up with an outstanding stream.
1846+
pool_->drainConnections(Envoy::ConnectionPool::DrainBehavior::DrainExistingConnections);
1847+
closeAllClients();
1848+
CHECK_STATE(0 /*active*/, 0 /*pending*/, 0 /*capacity*/);
1849+
}
1850+
1851+
TEST_F(Http2ConnPoolImplTest, TestDrainingNegativeUnusedCapacity) {
1852+
cluster_->http2_options_.mutable_max_concurrent_streams()->set_value(4);
1853+
1854+
expectClientsCreate(2);
1855+
ActiveTestRequest r1(*this, 0, false);
1856+
CHECK_STATE(0 /*active*/, 1 /*pending*/, 4 /*capacity*/);
1857+
expectClientConnect(0, r1);
1858+
CHECK_STATE(1 /*active*/, 0 /*pending*/, 3 /*capacity*/);
1859+
1860+
ActiveTestRequest r2(*this, 0, true);
1861+
ActiveTestRequest r3(*this, 0, true);
1862+
CHECK_STATE(3 /*active*/, 0 /*pending*/, 1 /*capacity*/);
1863+
1864+
// Settings frame results in negative unused capacity.
1865+
NiceMock<MockReceivedSettings> settings;
1866+
settings.max_concurrent_streams_ = 2;
1867+
test_clients_[0].codec_client_->onSettings(settings);
1868+
CHECK_STATE(3 /*active*/, 0 /*pending*/, -1 /*capacity*/);
1869+
1870+
// Clean up with an outstanding stream.
1871+
pool_->drainConnections(Envoy::ConnectionPool::DrainBehavior::DrainExistingConnections);
1872+
1873+
completeRequest(r1);
1874+
CHECK_STATE(2 /*active*/, 0 /*pending*/, 0 /*capacity*/);
1875+
1876+
// With negative capacity, verify that a new request creates a new connection.
1877+
ActiveTestRequest r4(*this, 1, false);
1878+
CHECK_STATE(2 /*active*/, 1 /*pending*/, 4 /*capacity*/);
1879+
expectClientConnect(1, r4);
1880+
CHECK_STATE(3 /*active*/, 0 /*pending*/, 3 /*capacity*/);
1881+
1882+
// Completing this request does NOT add more capacity because the connection is
1883+
// draining, and that connection's capacity is no longer negative.
1884+
completeRequest(r2);
1885+
CHECK_STATE(2 /*active*/, 0 /*pending*/, 3 /*capacity*/);
1886+
1887+
// After r3 completes, the draining connection should close.
1888+
EXPECT_CALL(*this, onClientDestroy());
1889+
completeRequest(r3);
1890+
dispatcher_.clearDeferredDeleteList();
1891+
CHECK_STATE(1 /*active*/, 0 /*pending*/, 3 /*capacity*/);
1892+
1893+
completeRequest(r4);
1894+
CHECK_STATE(0 /*active*/, 0 /*pending*/, 4 /*capacity*/);
1895+
1896+
closeClient(1);
1897+
}
1898+
18081899
TEST_F(Http2ConnPoolImplTest, TestStateWithMultiplexing) {
18091900
cluster_->http2_options_.mutable_max_concurrent_streams()->set_value(2);
18101901
cluster_->max_requests_per_connection_ = 4;

test/common/http/mixed_conn_pool_test.cc

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,6 +114,18 @@ TEST_F(MixedConnPoolImplTest, HandshakeWithCachedLimit) {
114114
testAlpnHandshake({});
115115
}
116116

117+
// Test that increasing the limit upon connect, versus what was in the cache before connection,
118+
// works correctly.
119+
TEST_F(MixedConnPoolImplTest, HandshakeWithCachedLimitAndEffectiveIncrease) {
120+
expected_capacity_ = 1;
121+
122+
// This simulates a previous connection being http 1.
123+
EXPECT_CALL(mock_cache_, getConcurrentStreams(_)).WillOnce(Return(1));
124+
125+
// This makes the new connection http 2, which has more than 1 stream available.
126+
testAlpnHandshake(Protocol::Http2);
127+
}
128+
117129
TEST_F(MixedConnPoolImplTest, HandshakeWithCachedLimitCapped) {
118130
EXPECT_CALL(mock_cache_, getConcurrentStreams(_))
119131
.WillOnce(Return(std::numeric_limits<uint32_t>::max()));

0 commit comments

Comments
 (0)