@@ -442,7 +442,7 @@ func NewContext(
442
442
conn .dialErr = & roachpb.NodeUnavailableError {}
443
443
}
444
444
})
445
- ctx .removeConn (k .(connKey ), conn )
445
+ ctx .removeConn (conn , k .(connKey ))
446
446
return true
447
447
})
448
448
})
@@ -566,10 +566,14 @@ func (ctx *Context) SetLocalInternalServer(internalServer roachpb.InternalServer
566
566
ctx .localInternalClient = internalClientAdapter {internalServer }
567
567
}
568
568
569
- func (ctx * Context ) removeConn (key connKey , conn * Connection ) {
570
- ctx .conns .Delete (key )
569
+ // removeConn removes the given connection from the pool. The supplied connKeys
570
+ // must represent *all* the keys under which the connection was shared.
571
+ func (ctx * Context ) removeConn (conn * Connection , keys ... connKey ) {
572
+ for _ , key := range keys {
573
+ ctx .conns .Delete (key )
574
+ }
571
575
if log .V (1 ) {
572
- log .Infof (ctx .masterCtx , "closing %+v" , key )
576
+ log .Infof (ctx .masterCtx , "closing %+v" , keys )
573
577
}
574
578
if grpcConn := conn .grpcConn ; grpcConn != nil {
575
579
if err := grpcConn .Close (); err != nil && ! grpcutil .IsClosedConnection (err ) {
@@ -719,10 +723,10 @@ func (ctx *Context) GRPCDialNode(target string, remoteNodeID roachpb.NodeID) *Co
719
723
}
720
724
721
725
func (ctx * Context ) grpcDialNodeInternal (target string , remoteNodeID roachpb.NodeID ) * Connection {
722
- thisConnKey := connKey {target , remoteNodeID }
723
- value , ok := ctx .conns .Load (thisConnKey )
726
+ thisConnKeys := [] connKey {{ target , remoteNodeID } }
727
+ value , ok := ctx .conns .Load (thisConnKeys [ 0 ] )
724
728
if ! ok {
725
- value , _ = ctx .conns .LoadOrStore (thisConnKey , newConnectionToNodeID (ctx .Stopper , remoteNodeID ))
729
+ value , _ = ctx .conns .LoadOrStore (thisConnKeys [ 0 ] , newConnectionToNodeID (ctx .Stopper , remoteNodeID ))
726
730
if remoteNodeID != 0 {
727
731
// If the first connection established at a target address is
728
732
// for a specific node ID, then we want to reuse that connection
@@ -732,12 +736,25 @@ func (ctx *Context) grpcDialNodeInternal(target string, remoteNodeID roachpb.Nod
732
736
// not strictly required for correctness.) This LoadOrStore will
733
737
// ensure we're registering the connection we just created for
734
738
// future use by these other dials.
735
- _ , _ = ctx .conns .LoadOrStore (connKey {target , 0 }, value )
739
+ //
740
+ // We need to be careful to unregister both connKeys when the
741
+ // connection breaks. Otherwise, we leak the entry below which
742
+ // "simulates" a hard network partition for anyone dialing without
743
+ // the nodeID (gossip).
744
+ //
745
+ // See:
746
+ // https://github.com/cockroachdb/cockroach/issues/37200
747
+ otherKey := connKey {target , 0 }
748
+ if _ , loaded := ctx .conns .LoadOrStore (otherKey , value ); ! loaded {
749
+ thisConnKeys = append (thisConnKeys , otherKey )
750
+ }
736
751
}
737
752
}
738
753
739
754
conn := value .(* Connection )
740
755
conn .initOnce .Do (func () {
756
+ // Either we kick off the heartbeat loop (and clean up when it's done),
757
+ // or we clean up the connKey entries immediately.
741
758
var redialChan <- chan struct {}
742
759
conn .grpcConn , redialChan , conn .dialErr = ctx .GRPCDialRaw (target )
743
760
if conn .dialErr == nil {
@@ -748,13 +765,15 @@ func (ctx *Context) grpcDialNodeInternal(target string, remoteNodeID roachpb.Nod
748
765
if err != nil && ! grpcutil .IsClosedConnection (err ) {
749
766
log .Errorf (masterCtx , "removing connection to %s due to error: %s" , target , err )
750
767
}
751
- ctx .removeConn (thisConnKey , conn )
768
+ ctx .removeConn (conn , thisConnKeys ... )
752
769
})
753
770
}); err != nil {
754
771
conn .dialErr = err
755
- ctx .removeConn (thisConnKey , conn )
756
772
}
757
773
}
774
+ if conn .dialErr != nil {
775
+ ctx .removeConn (conn , thisConnKeys ... )
776
+ }
758
777
})
759
778
760
779
return conn
0 commit comments