Skip to content

Commit 64ebf0a

Browse files
authored
Cherry-pick #8997 to v1.80.x (#9027)
Original PR: #8997 RELEASE NOTES: * xds/priority: Stop caching child LB policies removed from the configuration. This will help reduce memory and cpu usage when localities are constantly switching between priorities.
1 parent e45ed24 commit 64ebf0a

13 files changed

Lines changed: 816 additions & 240 deletions

File tree

balancer/weightedtarget/weightedtarget_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1348,7 +1348,7 @@ func addressesToAddrs(as []resolver.Address) []string {
13481348
return ret
13491349
}
13501350

1351-
const initIdleBalancerName = "test-init-Idle-balancer"
1351+
const initIdleBalancerName = "test-init-idle-balancer"
13521352

13531353
var errTestInitIdle = fmt.Errorf("init Idle balancer error 0")
13541354

@@ -1388,7 +1388,7 @@ func (s) TestInitialIdle(t *testing.T) {
13881388
"targets": {
13891389
"cluster_1": {
13901390
"weight":1,
1391-
"childPolicy": [{"test-init-Idle-balancer": ""}]
1391+
"childPolicy": [{"test-init-idle-balancer": ""}]
13921392
}
13931393
}
13941394
}`))

internal/balancergroup/balancergroup.go

Lines changed: 43 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -338,6 +338,16 @@ func (bg *BalancerGroup) Add(id string, builder balancer.Builder) {
338338
// closed after timeout. Cleanup work (closing sub-balancer and removing
339339
// subconns) will be done after timeout.
340340
func (bg *BalancerGroup) Remove(id string) {
341+
bg.removeInternal(id, true)
342+
}
343+
344+
// RemoveImmediately removes and closes the balancer with id from the group
345+
// immediately.
346+
func (bg *BalancerGroup) RemoveImmediately(id string) {
347+
bg.removeInternal(id, false)
348+
}
349+
350+
func (bg *BalancerGroup) removeInternal(id string, withCaching bool) {
341351
bg.logger.Infof("Removing child policy for child %q", id)
342352

343353
bg.outgoingMu.Lock()
@@ -356,32 +366,40 @@ func (bg *BalancerGroup) Remove(id string) {
356366
// Unconditionally remove the sub-balancer config from the map.
357367
delete(bg.idToBalancerConfig, id)
358368

359-
if bg.deletedBalancerCache != nil {
360-
if bg.logger.V(2) {
361-
bg.logger.Infof("Adding child policy for child %q to the balancer cache", id)
362-
bg.logger.Infof("Number of items remaining in the balancer cache: %d", bg.deletedBalancerCache.Len())
363-
}
364-
365-
bg.deletedBalancerCache.Add(id, sbToRemove, func() {
369+
if withCaching {
370+
if bg.deletedBalancerCache != nil {
371+
if bg.logger.V(2) {
372+
bg.logger.Infof("Adding child policy for child %q to the balancer cache", id)
373+
}
374+
bg.deletedBalancerCache.Add(id, sbToRemove, func() {
375+
if bg.logger.V(2) {
376+
bg.logger.Infof("Removing child policy for child %q from the balancer cache after timeout", id)
377+
bg.logger.Infof("Number of items remaining in the balancer cache: %d", bg.deletedBalancerCache.Len())
378+
}
379+
380+
// A sub-balancer evicted from the timeout cache needs to closed
381+
// and its subConns need to removed, unconditionally. There is a
382+
// possibility that a sub-balancer might be removed (thereby
383+
// moving it to the cache) around the same time that the
384+
// balancergroup is closed, and by the time we get here the
385+
// balancergroup might be closed. Check for `outgoingStarted ==
386+
// true` at that point can lead to a leaked sub-balancer.
387+
bg.outgoingMu.Lock()
388+
sbToRemove.stopBalancer()
389+
bg.outgoingMu.Unlock()
390+
bg.cleanupSubConns(sbToRemove)
391+
})
366392
if bg.logger.V(2) {
367-
bg.logger.Infof("Removing child policy for child %q from the balancer cache after timeout", id)
368393
bg.logger.Infof("Number of items remaining in the balancer cache: %d", bg.deletedBalancerCache.Len())
369394
}
370-
371-
// A sub-balancer evicted from the timeout cache needs to closed
372-
// and its subConns need to removed, unconditionally. There is a
373-
// possibility that a sub-balancer might be removed (thereby
374-
// moving it to the cache) around the same time that the
375-
// balancergroup is closed, and by the time we get here the
376-
// balancergroup might be closed. Check for `outgoingStarted ==
377-
// true` at that point can lead to a leaked sub-balancer.
378-
bg.outgoingMu.Lock()
379-
sbToRemove.stopBalancer()
380395
bg.outgoingMu.Unlock()
381-
bg.cleanupSubConns(sbToRemove)
382-
})
383-
bg.outgoingMu.Unlock()
384-
return
396+
return
397+
}
398+
399+
// Fall through to remove the sub-balancer with immediate effect if we are not caching.
400+
if bg.logger.V(2) {
401+
bg.logger.Infof("Child policy for child %q was requested to be cached before eventual removal. No such cache exists. Removing right away.", id)
402+
}
385403
}
386404

387405
// Remove the sub-balancer with immediate effect if we are not caching.
@@ -481,7 +499,7 @@ func (bg *BalancerGroup) ResolverError(err error) {
481499
// from map. Delete sc from the map only when state changes to Shutdown. Since
482500
// it's just forwarding the action, there's no need for a removeSubConn()
483501
// wrapper function.
484-
func (bg *BalancerGroup) newSubConn(config *subBalancerWrapper, addrs []resolver.Address, opts balancer.NewSubConnOptions) (balancer.SubConn, error) {
502+
func (bg *BalancerGroup) newSubConn(sbw *subBalancerWrapper, addrs []resolver.Address, opts balancer.NewSubConnOptions) (balancer.SubConn, error) {
485503
// NOTE: if balancer with id was already removed, this should also return
486504
// error. But since we call balancer.stopBalancer when removing the balancer, this
487505
// shouldn't happen.
@@ -493,12 +511,12 @@ func (bg *BalancerGroup) newSubConn(config *subBalancerWrapper, addrs []resolver
493511
var sc balancer.SubConn
494512
oldListener := opts.StateListener
495513
opts.StateListener = func(state balancer.SubConnState) { bg.updateSubConnState(sc, state, oldListener) }
496-
sc, err := bg.cc.NewSubConn(addrs, opts)
514+
sc, err := sbw.ClientConn.NewSubConn(addrs, opts)
497515
if err != nil {
498516
bg.incomingMu.Unlock()
499517
return nil, err
500518
}
501-
bg.scToSubBalancer[sc] = config
519+
bg.scToSubBalancer[sc] = sbw
502520
bg.incomingMu.Unlock()
503521
return sc, nil
504522
}
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
/*
2+
* Copyright 2026 gRPC authors.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package balancergroup_test
18+
19+
import (
20+
"context"
21+
"strings"
22+
"testing"
23+
"time"
24+
25+
"google.golang.org/grpc/balancer"
26+
"google.golang.org/grpc/internal/balancer/stub"
27+
"google.golang.org/grpc/internal/balancergroup"
28+
"google.golang.org/grpc/internal/grpctest"
29+
"google.golang.org/grpc/internal/testutils"
30+
)
31+
32+
type s struct {
33+
grpctest.Tester
34+
}
35+
36+
func Test(t *testing.T) {
37+
grpctest.RunSubTests(t, s{})
38+
}
39+
40+
const (
41+
defaultTestTimeout = 5 * time.Second
42+
defaultTestShortTimeout = 100 * time.Millisecond
43+
)
44+
45+
// Tests verifies that the RemoveImmediately method of balancergroup closes the
46+
// child balancer immediately, while the Remove method does not close the child
47+
// balancer immediately, when the cache is enabled.
48+
func (s) TestBalancerGroup_RemoveImmediately(t *testing.T) {
49+
// Channels to track child balancer creation and closure.
50+
childLBCreated := make(chan string, 1)
51+
childLBClosed := make(chan string, 1)
52+
53+
childLBName1 := strings.ToLower(t.Name()) + "-child-1"
54+
t.Logf("Registering a child balancer with name %q", childLBName1)
55+
stub.Register(childLBName1, stub.BalancerFuncs{
56+
Init: func(*stub.BalancerData) {
57+
childLBCreated <- childLBName1
58+
},
59+
Close: func(*stub.BalancerData) {
60+
childLBClosed <- childLBName1
61+
},
62+
})
63+
64+
childLBName2 := strings.ToLower(t.Name()) + "-child-2"
65+
t.Logf("Registering a child balancer with name %q", childLBName2)
66+
stub.Register(childLBName2, stub.BalancerFuncs{
67+
Init: func(*stub.BalancerData) {
68+
childLBCreated <- childLBName2
69+
},
70+
Close: func(*stub.BalancerData) {
71+
childLBClosed <- childLBName2
72+
},
73+
})
74+
75+
t.Logf("Creating a balancergroup with cache enabled")
76+
tcc := testutils.NewBalancerClientConn(t)
77+
bg := balancergroup.New(balancergroup.Options{
78+
CC: tcc,
79+
BuildOpts: balancer.BuildOptions{},
80+
SubBalancerCloseTimeout: defaultTestTimeout,
81+
})
82+
83+
t.Logf("Adding a child balancer with name %q to the group", "child-1")
84+
bg.AddWithClientConn("child-1", childLBName1, tcc)
85+
select {
86+
case <-childLBCreated:
87+
case <-time.After(defaultTestTimeout):
88+
t.Fatalf("Timeout when waiting for child LB to be created")
89+
}
90+
91+
t.Logf("Adding a child balancer with name %q to the group", "child-2")
92+
bg.AddWithClientConn("child-2", childLBName2, tcc)
93+
select {
94+
case <-childLBCreated:
95+
case <-time.After(defaultTestTimeout):
96+
t.Fatalf("Timeout when waiting for child LB to be created")
97+
}
98+
99+
t.Logf("Removing the child balancer with name %q from the group with immediate effect", "child-1")
100+
bg.RemoveImmediately("child-1")
101+
select {
102+
case <-childLBClosed:
103+
case <-time.After(defaultTestTimeout):
104+
t.Fatalf("Timeout when waiting for child LB to be closed")
105+
}
106+
107+
t.Logf("Removing the child balancer with name %q from the group with caching", "child-2")
108+
bg.Remove("child-2")
109+
sCtx, sCancel := context.WithTimeout(context.Background(), defaultTestShortTimeout)
110+
defer sCancel()
111+
select {
112+
case <-childLBClosed:
113+
t.Fatalf("Child LB closed when expected to be cached")
114+
case <-sCtx.Done():
115+
}
116+
117+
t.Logf("Closing the balancergroup, which should close the second child balancer")
118+
bg.Close()
119+
select {
120+
case <-childLBClosed:
121+
case <-time.After(defaultTestTimeout):
122+
t.Fatalf("Timeout when waiting for child LB to be closed")
123+
}
124+
}

internal/envconfig/envconfig.go

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,13 @@ var (
119119
// A future release will remove this environment variable, enabling strict
120120
// path checking behavior unconditionally.
121121
DisableStrictPathChecking = boolFromEnv("GRPC_GO_EXPERIMENTAL_DISABLE_STRICT_PATH_CHECKING", false)
122+
123+
// EnablePriorityLBChildPolicyCache controls whether the priority balancer
124+
// should cache child balancers that are removed from the LB policy config,
125+
// for a period of 15 minutes. This is disabled by default, but can be
126+
// enabled by setting the env variable
127+
// GRPC_EXPERIMENTAL_ENABLE_PRIORITY_LB_CHILD_POLICY_CACHE to true.
128+
EnablePriorityLBChildPolicyCache = boolFromEnv("GRPC_EXPERIMENTAL_ENABLE_PRIORITY_LB_CHILD_POLICY_CACHE", false)
122129
)
123130

124131
func boolFromEnv(envVar string, def bool) bool {

internal/xds/balancer/cdsbalancer/configbuilder_childname.go

Lines changed: 42 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -31,9 +31,10 @@ import (
3131
// struct keeps state between generate() calls, and a later generate() might
3232
// return names returned by the previous call.
3333
type nameGenerator struct {
34-
existingNames map[clients.Locality]string
35-
prefix uint64
36-
nextID uint64
34+
prevLocalitiesToChildNames map[clients.Locality]string // locality to child name mapping generated for the previous update
35+
prevChildNames []string // prioritized list of child names generated for the previous update
36+
prefix uint64
37+
nextID uint64
3738
}
3839

3940
func newNameGenerator(prefix uint64) *nameGenerator {
@@ -53,36 +54,58 @@ func newNameGenerator(prefix uint64) *nameGenerator {
5354
// - update 3: [[L1, L2], [L3]] --> ["0", "2"] (Two priorities were merged)
5455
// - update 4: [[L1], [L4]] --> ["0", "3",] (A priority was split, and a new priority was added)
5556
func (ng *nameGenerator) generate(priorities [][]xdsresource.Locality) []string {
56-
var ret []string
57+
ret := make([]string, len(priorities))
5758
usedNames := make(map[string]bool)
5859
newNames := make(map[clients.Locality]string)
59-
for _, priority := range priorities {
60-
var nameFound string
60+
61+
// Pass 1: Same priority index match.
62+
for i, priority := range priorities {
63+
if i >= len(ng.prevChildNames) {
64+
continue
65+
}
66+
targetName := ng.prevChildNames[i]
67+
for _, locality := range priority {
68+
if name, ok := ng.prevLocalitiesToChildNames[locality.ID]; ok && name == targetName {
69+
ret[i] = targetName
70+
usedNames[targetName] = true
71+
break
72+
}
73+
}
74+
}
75+
76+
// Pass 2: Greedy reuse.
77+
for i, priority := range priorities {
78+
if ret[i] != "" {
79+
continue
80+
}
6181
for _, locality := range priority {
62-
if name, ok := ng.existingNames[locality.ID]; ok {
82+
if name, ok := ng.prevLocalitiesToChildNames[locality.ID]; ok {
6383
if !usedNames[name] {
64-
nameFound = name
65-
// Found a name to use. No need to process the remaining
66-
// localities.
84+
ret[i] = name
85+
usedNames[name] = true
6786
break
6887
}
6988
}
7089
}
90+
}
7191

72-
if nameFound == "" {
73-
// No appropriate used name is found. Make a new name.
74-
nameFound = fmt.Sprintf("priority-%d-%d", ng.prefix, ng.nextID)
92+
// Pass 3: New name.
93+
for i, name := range ret {
94+
if name == "" {
95+
newID := fmt.Sprintf("priority-%d-%d", ng.prefix, ng.nextID)
7596
ng.nextID++
97+
ret[i] = newID
98+
usedNames[newID] = true
7699
}
100+
}
77101

78-
ret = append(ret, nameFound)
79-
// All localities in this priority share the same name. Add them all to
80-
// the new map.
102+
// Update state.
103+
for i, priority := range priorities {
81104
for _, l := range priority {
82-
newNames[l.ID] = nameFound
105+
newNames[l.ID] = ret[i]
83106
}
84-
usedNames[nameFound] = true
85107
}
86-
ng.existingNames = newNames
108+
ng.prevLocalitiesToChildNames = newNames
109+
ng.prevChildNames = ret
87110
return ret
88111
}

0 commit comments

Comments
 (0)