16
16
from dask_kubernetes .common .networking import get_scheduler_address
17
17
from dask_kubernetes .aiopykube import HTTPClient , KubeConfig
18
18
from dask_kubernetes .aiopykube .dask import DaskCluster
19
- from distributed .core import rpc
19
+ from distributed .core import rpc , clean_exception
20
+ from distributed .protocol .pickle import dumps
20
21
21
22
_ANNOTATION_NAMESPACES_TO_IGNORE = (
22
23
"kopf.zalando.org" ,
@@ -416,7 +417,7 @@ async def retire_workers(
416
417
)
417
418
418
419
# Otherwise try gracefully retiring via the RPC
419
- logger .info (
420
+ logger .debug (
420
421
f"Scaling { worker_group_name } failed via the HTTP API, falling back to the Dask RPC"
421
422
)
422
423
# Dask version mismatches between the operator and scheduler may cause this to fail in any number of unexpected ways
@@ -435,7 +436,7 @@ async def retire_workers(
435
436
return workers_to_close
436
437
437
438
# Finally fall back to last-in-first-out scaling
438
- logger .info (
439
+ logger .debug (
439
440
f"Scaling { worker_group_name } failed via the Dask RPC, falling back to LIFO scaling"
440
441
)
441
442
async with kubernetes .client .api_client .ApiClient () as api_client :
@@ -447,6 +448,75 @@ async def retire_workers(
447
448
return [w ["metadata" ]["name" ] for w in workers .items [:- n_workers ]]
448
449
449
450
451
async def check_scheduler_idle(scheduler_service_name, namespace, logger):
    """Return the timestamp since which the scheduler has been idle.

    Tries three strategies in order, falling through on failure:

    1. The scheduler dashboard HTTP API (``/api/v1/check_idle``).
    2. The ``check_idle`` handler via the Dask RPC.
    3. Injecting a probe function with ``run_function`` over the RPC
       (needed for distributed<=2023.3.1, which lacks ``check_idle``).

    Parameters
    ----------
    scheduler_service_name:
        Name of the Kubernetes Service fronting the scheduler.
    namespace:
        Kubernetes namespace the service lives in.
    logger:
        Logger used for debug/diagnostic output.

    Returns
    -------
    The ``idle_since`` value reported by the scheduler (falsy when the
    scheduler is not idle).
    """
    # Strategy 1: ask the dashboard HTTP API.
    dashboard_address = await get_scheduler_address(
        scheduler_service_name,
        namespace,
        port_name="http-dashboard",
        allow_external=False,
    )
    async with aiohttp.ClientSession() as session:
        url = f"{dashboard_address}/api/v1/check_idle"
        async with session.get(url) as resp:
            # Only 2xx responses carry a valid payload. The previous
            # `<= 300` comparison wrongly accepted HTTP 300 (Multiple
            # Choices) as success.
            if resp.status < 300:
                idle_since = (await resp.json())["idle_since"]
                if idle_since:
                    logger.debug("Scheduler idle since: %s", idle_since)
                return idle_since
            logger.debug(
                "Received %d response from scheduler API with body %s",
                resp.status,
                await resp.text(),
            )

    # Strategy 2: fall back to the Dask RPC.
    logger.debug(
        f"Checking {scheduler_service_name} idleness failed via the HTTP API, falling back to the Dask RPC"
    )
    # Dask version mismatches between the operator and scheduler may cause this to fail in any number of unexpected ways
    with suppress(Exception):
        comm_address = await get_scheduler_address(
            scheduler_service_name,
            namespace,
            allow_external=False,
        )
        async with rpc(comm_address) as scheduler_comm:
            idle_since = await scheduler_comm.check_idle()
            if idle_since:
                logger.debug("Scheduler idle since: %s", idle_since)
            return idle_since

    # Strategy 3: code injection via the Dask RPC for distributed<=2023.3.1.
    logger.debug(
        f"Checking {scheduler_service_name} idleness failed via the Dask RPC, falling back to run_on_scheduler"
    )

    def _get_idle_since(dask_scheduler=None):
        # Runs on the scheduler itself; make sure idleness tracking is
        # enabled before querying it. (Renamed from `idle_since` to avoid
        # being shadowed by the result variable below.)
        if not dask_scheduler.idle_timeout:
            dask_scheduler.idle_timeout = 300
        dask_scheduler.check_idle()
        return dask_scheduler.idle_since

    comm_address = await get_scheduler_address(
        scheduler_service_name,
        namespace,
        allow_external=False,
    )
    async with rpc(comm_address) as scheduler_comm:
        response = await scheduler_comm.run_function(
            function=dumps(_get_idle_since),
        )
        if response["status"] == "error":
            typ, exc, tb = clean_exception(**response)
            raise exc.with_traceback(tb)
        else:
            idle_since = response["result"]
            if idle_since:
                logger.debug("Scheduler idle since: %s", idle_since)
            return idle_since
450
520
async def get_desired_workers (scheduler_service_name , namespace , logger ):
451
521
# Try gracefully retiring via the HTTP API
452
522
dashboard_address = await get_scheduler_address (
@@ -901,3 +971,23 @@ async def daskautoscaler_adapt(spec, name, namespace, logger, **kwargs):
901
971
logger .debug (
902
972
"Not autoscaling %s with %d workers" , spec ["cluster" ], current_replicas
903
973
)
974
+
975
+
976
@kopf.timer("daskcluster.kubernetes.dask.org", interval=5.0)
async def daskcluster_autoshutdown(spec, name, namespace, logger, **kwargs):
    """Delete the DaskCluster once its scheduler has been idle longer than ``idleTimeout``.

    Runs every 5 seconds via a kopf timer. Does nothing when
    ``spec.idleTimeout`` is unset or falsy.
    """
    # `.get` avoids a KeyError (and endless kopf retries) when the
    # idleTimeout field is absent from the spec.
    idle_timeout = spec.get("idleTimeout")
    if not idle_timeout:
        return
    try:
        idle_since = await check_scheduler_idle(
            scheduler_service_name=f"{name}-scheduler",
            namespace=namespace,
            logger=logger,
        )
    except Exception as e:
        # Best effort: the scheduler may not be up yet. Log the cause
        # (previously swallowed) and retry on the next tick.
        # `logger.warning` replaces the deprecated `logger.warn` alias.
        logger.warning(
            "Unable to connect to scheduler, skipping autoshutdown check: %s", e
        )
        return
    if idle_since and time.time() > idle_since + idle_timeout:
        api = HTTPClient(KubeConfig.from_env())
        cluster = await DaskCluster.objects(api, namespace=namespace).get_by_name(
            name
        )
        await cluster.delete()
0 commit comments