2020import static com .google .common .collect .ImmutableSet .toImmutableSet ;
2121import static org .openqa .selenium .grid .data .Availability .DOWN ;
2222import static org .openqa .selenium .grid .data .Availability .DRAINING ;
23+ import static org .openqa .selenium .grid .data .Availability .UP ;
2324import static org .openqa .selenium .internal .Debug .getDebugLogLevel ;
2425import static org .openqa .selenium .remote .RemoteTags .CAPABILITIES ;
2526import static org .openqa .selenium .remote .RemoteTags .CAPABILITIES_EVENT ;
4445import org .openqa .selenium .concurrent .GuardedRunnable ;
4546import org .openqa .selenium .events .EventBus ;
4647import org .openqa .selenium .grid .config .Config ;
48+ import org .openqa .selenium .grid .data .Availability ;
4749import org .openqa .selenium .grid .data .CreateSessionRequest ;
4850import org .openqa .selenium .grid .data .CreateSessionResponse ;
4951import org .openqa .selenium .grid .data .DistributorStatus ;
9193
9294import java .io .Closeable ;
9395import java .io .UncheckedIOException ;
96+ import java .net .URI ;
9497import java .time .Duration ;
9598import java .util .ArrayList ;
9699import java .util .Collection ;
@@ -297,8 +300,10 @@ public LocalDistributor add(Node node) {
297300
298301 // An exception occurs if Node heartbeat has started but the server is not ready.
299302 // Unhandled exception blocks the event-bus thread from processing any event henceforth.
303+ NodeStatus initialNodeStatus ;
300304 try {
301- model .add (node .getStatus ());
305+ initialNodeStatus = node .getStatus ();
306+ model .add (initialNodeStatus );
302307 nodes .put (node .getId (), node );
303308 } catch (Exception e ) {
304309 LOG .log (
@@ -309,21 +314,10 @@ public LocalDistributor add(Node node) {
309314 }
310315
311316 // Extract the health check
312- Runnable runnableHealthCheck = asRunnableHealthCheck (node );
313- allChecks .put (node .getId (), runnableHealthCheck );
314-
315- // Running the health check right after the Node registers itself. We retry the
316- // execution because the Node might on a complex network topology. For example,
317- // Kubernetes pods with IPs that take a while before they are reachable.
318- RetryPolicy <Object > initialHealthCheckPolicy = new RetryPolicy <>()
319- .withMaxAttempts (-1 )
320- .withMaxDuration (Duration .ofSeconds (90 ))
321- .withDelay (Duration .ofSeconds (15 ))
322- .abortIf (result -> true );
323-
324- LOG .log (getDebugLogLevel (), "Running initial health check for Node " + node .getUri ());
325- Executors .newSingleThreadExecutor ().submit (
326- () -> Failsafe .with (initialHealthCheckPolicy ).run (runnableHealthCheck ::run ));
317+ Runnable healthCheck = asRunnableHealthCheck (node );
318+ allChecks .put (node .getId (), healthCheck );
319+
320+ updateNodeStatus (initialNodeStatus , healthCheck );
327321
328322 LOG .info (String .format (
329323 "Added node %s at %s. Health check every %ss" ,
@@ -336,6 +330,27 @@ public LocalDistributor add(Node node) {
336330 return this ;
337331 }
338332
333+ private void updateNodeStatus (NodeStatus status , Runnable healthCheck ) {
334+ // Setting the Node as available if the initial call to status was successful.
335+ // Otherwise, retry to have it available as soon as possible.
336+ if (status .getAvailability () == UP ) {
337+ updateNodeAvailability (status .getExternalUri (), status .getNodeId (), status .getAvailability ());
338+ } else {
339+ // Running the health check right after the Node registers itself. We retry the
340+ // execution because the Node might on a complex network topology. For example,
341+ // Kubernetes pods with IPs that take a while before they are reachable.
342+ RetryPolicy <Object > initialHealthCheckPolicy = new RetryPolicy <>()
343+ .withMaxAttempts (-1 )
344+ .withMaxDuration (Duration .ofSeconds (90 ))
345+ .withDelay (Duration .ofSeconds (15 ))
346+ .abortIf (result -> true );
347+
348+ LOG .log (getDebugLogLevel (), "Running health check for Node " + status .getExternalUri ());
349+ Executors .newSingleThreadExecutor ().submit (
350+ () -> Failsafe .with (initialHealthCheckPolicy ).run (healthCheck ::run ));
351+ }
352+ }
353+
339354 private Runnable runNodeHealthChecks () {
340355 return () -> {
341356 ImmutableMap <NodeId , Runnable > nodeHealthChecks = ImmutableMap .copyOf (allChecks );
@@ -363,26 +378,27 @@ private Runnable asRunnableHealthCheck(Node node) {
363378 failedCheckException = e ;
364379 }
365380
366- Lock writeLock = lock .writeLock ();
367- writeLock .lock ();
368- try {
369- LOG .log (
370- getDebugLogLevel (),
371- String .format (
372- "Health check result for %s was %s" ,
373- node .getUri (),
374- result .getAvailability ()));
375- model .setAvailability (id , result .getAvailability ());
376- model .updateHealthCheckCount (id , result .getAvailability ());
377- } finally {
378- writeLock .unlock ();
379- }
381+ updateNodeAvailability (node .getUri (), id , result .getAvailability ());
380382 if (checkFailed ) {
381383 throw new HealthCheckFailedException ("Node " + id , failedCheckException );
382384 }
383385 };
384386 }
385387
388+ private void updateNodeAvailability (URI nodeUri , NodeId id , Availability availability ) {
389+ Lock writeLock = lock .writeLock ();
390+ writeLock .lock ();
391+ try {
392+ LOG .log (
393+ getDebugLogLevel (),
394+ String .format ("Health check result for %s was %s" , nodeUri , availability ));
395+ model .setAvailability (id , availability );
396+ model .updateHealthCheckCount (id , availability );
397+ } finally {
398+ writeLock .unlock ();
399+ }
400+ }
401+
386402 @ Override
387403 public boolean drain (NodeId nodeId ) {
388404 Node node = nodes .get (nodeId );
0 commit comments