// Copyright 2017 The Ray Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This header file is used to avoid code duplication.
// It can be included multiple times in ray_config.h, and each inclusion
// could use a different definition of the RAY_CONFIG macro.
// Macro definition format: RAY_CONFIG(type, name, default_value).
// NOTE: This file should NOT be included in any file other than ray_config.h.
/// The duration between dumping debug info to logs, or 0 to disable.
RAY_CONFIG(uint64_t, debug_dump_period_milliseconds, 10000)
/// Whether to enable Ray event stats collection.
RAY_CONFIG(bool, event_stats, true)
/// Whether to enable Ray event stats metrics for main services
/// such as the GCS and raylet (which today are the sole consumers of
/// this config).
RAY_CONFIG(bool, emit_main_service_metrics, true)
/// Whether to enable cluster authentication.
RAY_CONFIG(bool, enable_cluster_auth, true)
/// Whether to enable token-based authentication for RPC calls.
/// The value is converted to the AuthenticationMode enum defined in
/// rpc/authentication/authentication_mode.h;
/// use GetAuthenticationMode() to get the authentication mode enum value.
RAY_CONFIG(std::string, AUTH_MODE, "disabled")
/// Whether to enable Kubernetes token-based authentication for RPC calls.
RAY_CONFIG(bool, ENABLE_K8S_TOKEN_AUTH, false)
/// The interval at which periodic event loop stats are printed.
/// -1 means the feature is disabled; in that case, stats are available
/// in the associated process's log file.
/// NOTE: This requires event_stats=1.
RAY_CONFIG(int64_t, event_stats_print_interval_ms, 60000)
/// In theory, this is used to detect Ray cookie mismatches.
/// This magic number (hex for "RAY") is used instead of zero because, while it
/// is still possible that some random program sends an int64_t that is zero,
/// it is much less likely that a program sends this particular magic number.
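/// (ASCII: 0x52 = 'R', 0x41 = 'A', 0x59 = 'Y' in the three high-order bytes.)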
RAY_CONFIG(int64_t, ray_cookie, 0x5241590000000000)
/// The duration that a single handler on the event loop can take before a
/// warning is logged that the handler is taking too long.
RAY_CONFIG(int64_t, handler_warning_timeout_ms, 1000)
/// The duration between resource load pulls by the GCS.
RAY_CONFIG(uint64_t, gcs_pull_resource_loads_period_milliseconds, 1000)
/// The duration between reporting resources sent by the raylets.
RAY_CONFIG(uint64_t, raylet_report_resources_period_milliseconds, 100)
/// The interval at which the raylet checks memory pressure and sends GC requests.
RAY_CONFIG(uint64_t, raylet_check_gc_period_milliseconds, 100)
/// Threshold at which the node is considered beyond its memory capacity. If memory
/// usage is above memory_usage_threshold and free space is below min_memory_free_bytes,
/// the node will start killing processes to free up space. Valid range: [0, 1].
/// Note: when resource isolation is enabled, the memory usage threshold is set to
/// total memory - system reserved memory (can be specified in ray start) -
/// kill_memory_buffer_bytes. Notice that the formula does not account for object store
/// memory in system reserved memory. To configure the usage threshold, adjust the
/// system reserved memory in the ray start command instead.
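/// Example override, using the RAY_<name> environment variable convention referenced
/// elsewhere in this file (e.g. RAY_grpc_enable_http_proxy):
///   RAY_memory_usage_threshold=0.9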
RAY_CONFIG(float, memory_usage_threshold, 0.95)
/// The interval between runs of the memory usage monitor.
/// Monitor is disabled when this value is 0.
RAY_CONFIG(uint64_t, memory_monitor_refresh_ms, 250)
/// The minimum amount of free space. If memory usage is above
/// memory_usage_threshold and free space is below min_memory_free_bytes, the node
/// will start killing processes to free up space. Disabled if it is -1.
///
/// This value is useful for larger hosts, where memory_usage_threshold could leave
/// a large chunk of memory unusable; e.g. on a host with 64GB of memory, a 0.9
/// threshold means 6.4 GB of the memory will not be usable.
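/// For example, to always keep at least 2 GiB free regardless of the threshold, set
/// this to 2147483648 (2 * 1024 * 1024 * 1024).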
RAY_CONFIG(int64_t, min_memory_free_bytes, (int64_t)-1)
/// The amount of memory to free under the memory usage threshold when
/// killing workers via the worker killing policy.
RAY_CONFIG(uint64_t, kill_memory_buffer_bytes, 3ULL * 1024 * 1024 * 1024) // 3GB
/// The reserved memory bytes for system processes
/// enforced via cgroup memory.min constraint which guarantees
/// that the system processes' memory will not be reclaimed under any conditions.
/// Default is 0, meaning no memory.min constraint is applied.
/// By default, if resource isolation is enabled, system reserved memory
/// will be protected via memory.low instead. Only configure this value
/// if you are certain that you want the min constraint protection.
RAY_CONFIG(int64_t, system_memory_bytes_min, 0)
/// The proportion of total memory the user processes are allowed to use.
/// Enforced by the cgroup memory.high constraint, which throttles the
/// user processes when the threshold is reached.
/// Default is 1.0, meaning the user processes are allowed to use 100% of the total
/// memory. Only configure this value if you are confident that
/// the configuration is desirable. Bad constraint configurations may
/// lead to significant system performance degradation.
RAY_CONFIG(float, user_memory_proportion_high, 1.0)
/// The proportion of total memory the user processes are allowed to use.
/// Enforced by the cgroup memory.max constraint, which triggers the
/// kernel OOM killer when the threshold is reached.
/// Default is 1.0, meaning the user processes are allowed to use 100% of the total
/// memory. Only configure this value if you are confident that
/// the configuration is desirable. Bad constraint configurations may
/// lead to significant system performance degradation.
RAY_CONFIG(float, user_memory_proportion_max, 1.0)
/// The TTL after which a task failure entry is considered
/// eligible for garbage collection.
RAY_CONFIG(uint64_t, task_failure_entry_ttl_ms, 15 * 60 * 1000)
/// The number of retries for a task or actor when it fails because the process
/// was killed while the node was running low on memory.
/// The process killing is done by the memory monitor, which is enabled via
/// memory_monitor_refresh_ms. If the task or actor is not retriable, this value is
/// ignored. This retry counter is only used when the process is killed due to memory
/// pressure; the task's or actor's own retry counter is used when it fails in other
/// ways not related to running out of memory. Retries indefinitely if the value is -1.
RAY_CONFIG(uint64_t, task_oom_retries, -1)
/// Whether to report placement or regular resource usage for an actor.
/// Reporting placement may cause the autoscaler to overestimate the resources
/// required by the cluster, but reporting regular resources may lead to no
/// autoscaling when an actor can't be placed.
/// https://github.com/ray-project/ray/issues/26806
RAY_CONFIG(bool, report_actor_placement_resources, true)
/// Whether to record the creation sites of object references. This adds more
/// information to `ray memory`, but introduces a little extra overhead when
/// creating object references (e.g. 5~10 microsec per call in Python).
/// TODO: maybe group this under RAY_DEBUG.
RAY_CONFIG(bool, record_ref_creation_sites, false)
/// Collects the stacktrace of the task invocation, or actor creation. The stacktrace is
/// serialized into the TaskSpec and is viewable from the Dashboard. Default is disabled.
RAY_CONFIG(bool, record_task_actor_creation_sites, false)
/// Objects that have been unpinned are
/// added to a local cache. When the cache is flushed, all objects in the cache
/// will be eagerly evicted in a batch by freeing all plasma copies in the
/// cluster. If set, then this is the duration between attempts to flush the
/// local cache. If this is set to 0, then the objects will be freed as soon as
/// they enter the cache. To disable eager eviction, set this to -1.
/// NOTE(swang): The timer is checked by the raylet during every heartbeat, so
/// this should be set to a value larger than
/// raylet_heartbeat_period_milliseconds.
RAY_CONFIG(int64_t, free_objects_period_milliseconds, 1000)
/// Objects that have been unpinned are
/// added to a local cache. When the cache is flushed, all objects in the cache
/// will be eagerly evicted in a batch by freeing all plasma copies in the
/// cluster. This is the maximum number of objects in the local cache before it
/// is flushed. To disable eager eviction, set free_objects_period_milliseconds
/// to -1.
RAY_CONFIG(size_t, free_objects_batch_size, 100)
/// Whether to pin object lineage, i.e. the task that created the object and
/// the task's recursive dependencies. If this is set to true, then the system
/// will attempt to reconstruct the object from its lineage if the object is
/// lost.
RAY_CONFIG(bool, lineage_pinning_enabled, true)
/// Maximum amount of lineage to keep in bytes. This includes the specs of all
/// tasks that have already finished but that may be retried again.
/// If we reach this limit, 50% of the current lineage will be evicted and
/// objects that are still in scope will no longer be reconstructed if lost.
/// Each task spec is on the order of 1KB but can be much larger if it has many
/// inlined args.
RAY_CONFIG(int64_t, max_lineage_bytes, 1024 * 1024 * 1024)
/// Whether to pre-populate plasma memory. This avoids memory allocation failures
/// at runtime (SIGBUS errors when creating new objects); however, it uses more memory
/// upfront and can slow down Ray startup.
/// See also: https://github.com/ray-project/ray/issues/14182
RAY_CONFIG(bool, preallocate_plasma_memory, false)
// If true, we place a soft cap on the number of scheduling classes, see
// `worker_cap_initial_backoff_delay_ms`.
RAY_CONFIG(bool, worker_cap_enabled, true)
/// We place a soft cap on the number of tasks of a given scheduling class that
/// can run at once, to limit the total number of worker processes. After the
/// specified interval, the next task above that cap is allowed to run. The time
/// before subsequent tasks (above the cap) are allowed to run increases
/// exponentially. The soft cap is needed to prevent deadlock in the case where
/// a task begins to execute and tries to `ray.get` another task of the same
/// class.
RAY_CONFIG(int64_t, worker_cap_initial_backoff_delay_ms, 1000)
/// After reaching the worker cap, the backoff delay will grow exponentially,
/// until it hits a maximum delay.
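/// Illustrative sketch (the growth factor here is an assumption, not taken from this
/// file): with a doubling schedule, successive delays would be 1s, 2s, 4s, 8s, and
/// then stay capped at 10s.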
RAY_CONFIG(int64_t, worker_cap_max_backoff_delay_ms, 1000 * 10)
/// The fraction of resource utilization on a node after which the scheduler starts
/// to prefer spreading tasks to other nodes. This balances between locality and
/// even balancing of load. Low values (min 0.0) encourage more load spreading.
RAY_CONFIG(float, scheduler_spread_threshold, 0.5)
/// Used by the default hybrid policy only. The scheduler will randomly pick
/// one node from the top k in the cluster to improve load balancing. The
/// scheduler guarantees k is at least equal to this fraction * the number of
/// nodes in the cluster.
RAY_CONFIG(float, scheduler_top_k_fraction, 0.2);
/// Used by the default hybrid policy only. The scheduler will randomly pick
/// one node from the top k in the cluster to improve load balancing. The
/// scheduler guarantees k is at least equal to scheduler_top_k_absolute.
RAY_CONFIG(int32_t, scheduler_top_k_absolute, 1);
/// Whether to only report the usage of pinned copies of objects in the
/// object_store_memory resource. This means nodes holding only secondary copies
/// will become eligible for removal by the autoscaler.
RAY_CONFIG(bool, scheduler_report_pinned_bytes_only, true)
// The max allowed size in bytes of a return object from direct actor calls.
// Objects larger than this size will be spilled/promoted to plasma.
RAY_CONFIG(int64_t, max_direct_call_object_size, 100 * 1024)
// The max gRPC message size (the gRPC internal default is 4MB). We use a higher
// limit in Ray to avoid crashing with many small inlined task arguments.
// Keep in sync with GCS_STORAGE_MAX_SIZE in packaging.py.
RAY_CONFIG(size_t, max_grpc_message_size, 512 * 1024 * 1024)
// The max gRPC message size (the gRPC internal default is 4MB) in communication with the
// Agent.
//
// NOTE: This has to be kept in sync with AGENT_GRPC_MAX_MESSAGE_LENGTH in
// ray_constants.py
RAY_CONFIG(int64_t, agent_max_grpc_message_size, 20 * 1024 * 1024)
// Retry timeout for trying to create a gRPC server. Only applies if the number
// of retries is non-zero.
RAY_CONFIG(int64_t, grpc_server_retry_timeout_milliseconds, 1000)
// Whether to allow HTTP proxy on GRPC clients. Disable HTTP proxy by default since it
// disrupts local connections. Note that this config item only controls GrpcClient in
// `src/ray/rpc/grpc_client.h`. Python GRPC clients are not directly controlled by this.
// NOTE (kfstorm): DO NOT set this config item via `_system_config`, use
// `RAY_grpc_enable_http_proxy` environment variable instead so that it can be passed to
// non-C++ children processes such as dashboard agent.
RAY_CONFIG(bool, grpc_enable_http_proxy, false)
/// Warn if more than this many tasks are queued for submission to an actor.
/// It likely indicates a bug in the user code.
RAY_CONFIG(uint64_t, actor_excess_queueing_warn_threshold, 5000)
/// When trying to resolve an object, the initial period that the raylet will
/// wait before contacting the object's owner to check if the object is still
/// available. This is a lower bound on the time to report the loss of an
/// object stored in the distributed object store in the case that the worker
/// that created the original ObjectRef dies.
RAY_CONFIG(int64_t, object_timeout_milliseconds, 100)
/// The maximum duration that workers can hold on to another worker's lease
/// for direct task submission until it must be returned to the raylet.
RAY_CONFIG(int64_t, worker_lease_timeout_milliseconds, 500)
/// The interval at which the workers will check if their raylet has gone down.
/// When this happens, they will kill themselves.
RAY_CONFIG(uint64_t, raylet_death_check_interval_milliseconds, 1000)
/// These are used by the worker to set the interval for checking signals and
/// batching requests when getting objects.
RAY_CONFIG(int64_t, get_check_signal_interval_milliseconds, 1000)
RAY_CONFIG(int64_t, worker_fetch_request_size, 10000)
/// How long to wait for a fetch to complete during ray.get before warning the
/// user.
RAY_CONFIG(int64_t, fetch_warn_timeout_milliseconds, 60000)
/// How long to wait for a fetch before timing it out and throwing an error to
/// the user. This error should only be seen if there is extreme pressure on
/// the object directory, or if there is a bug in either object recovery or the
/// object directory.
RAY_CONFIG(int64_t, fetch_fail_timeout_milliseconds, 600000)
/// Temporary workaround for https://github.com/ray-project/ray/pull/16402.
RAY_CONFIG(bool, yield_plasma_lock_workaround, true)
/// Number of times raylet client tries connecting to a raylet.
RAY_CONFIG(int64_t, raylet_client_num_connect_attempts, 10)
RAY_CONFIG(int64_t, raylet_client_connect_timeout_milliseconds, 1000)
/// The duration that we wait after sending a worker SIGTERM before sending
/// the worker SIGKILL.
RAY_CONFIG(int64_t, kill_worker_timeout_milliseconds, 5000)
/// Timeout for graceful actor shutdown (e.g. when actor goes out of scope).
/// If an actor does not gracefully shut down within this timeout, it will be force
/// killed. Set to -1 for infinite timeout to prevent the actor from being force killed
/// during graceful shutdown.
RAY_CONFIG(int64_t, actor_graceful_shutdown_timeout_ms, 30000)
/// The duration that we wait after the worker is launched before the
/// starting_worker_timeout_callback() is called.
RAY_CONFIG(int64_t, worker_register_timeout_seconds, 60)
/// The maximum workers raylet can start at the same time.
/// 0 means it will use the default (number of CPUs).
RAY_CONFIG(int64_t, worker_maximum_startup_concurrency, 0)
/// Maximum number of retries for pop worker before the task is
/// cancelled. 0 means no retry (fail immediately), default is 5.
/// Retries indefinitely if the value is -1.
RAY_CONFIG(int32_t, pop_worker_max_retries, 5)
/// The maximum number of workers to iterate whenever we analyze the resources usage.
RAY_CONFIG(uint32_t, worker_max_resource_analysis_iteration, 128)
/// The maximum number of generator returns. We are using this to pre-reserve
/// Ray object ID indexes.
/// The first N indexes are for num_returns.
/// The next max_num_generator_returns indexes are for generator return.
/// The rest are for ray.put.
RAY_CONFIG(uint32_t, max_num_generator_returns, 100 * 1000 * 1000)
/// A value to add to workers' OOM score adjustment, so that the OS prioritizes
/// killing these over the raylet. 0 or positive values only (negative values
/// require sudo permissions).
/// NOTE(swang): Linux only.
RAY_CONFIG(int, worker_oom_score_adjustment, 1000)
/// Sets workers' nice value on POSIX systems, so that the OS prioritizes CPU for other
/// processes over workers. This keeps CPU available to the GCS, raylet, and user
/// processes even when workers are busy.
/// Valid values are in [0, 19] (negative values require sudo permissions).
/// NOTE: Linux, Unix, and macOS only.
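/// For example, the default of 15 makes workers run at a much lower CPU priority than
/// the default niceness of 0.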
RAY_CONFIG(int, worker_niceness, 15)
/// Allow at least 60 seconds for connecting to Redis.
RAY_CONFIG(int64_t, redis_db_connect_retries, 120)
RAY_CONFIG(int64_t, redis_db_connect_wait_milliseconds, 500)
/// Number of retries for a redis request failure.
RAY_CONFIG(size_t, num_redis_request_retries, 5)
/// Exponential backoff setup. By default:
/// 100ms, 200ms, 400ms, 800ms, 1s, 1s,...
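/// That is, the nth retry waits roughly
/// min(redis_retry_base_ms * redis_retry_multiplier^n, redis_retry_max_ms) ms.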
RAY_CONFIG(int64_t, redis_retry_base_ms, 100)
RAY_CONFIG(int64_t, redis_retry_multiplier, 2)
RAY_CONFIG(int64_t, redis_retry_max_ms, 1000)
/// The object manager's global timer interval in milliseconds.
RAY_CONFIG(int, object_manager_timer_freq_ms, 100)
/// Timeout, in milliseconds, to wait before retrying a failed pull in the
/// ObjectManager.
RAY_CONFIG(int, object_manager_pull_timeout_ms, 10000)
/// Timeout, in milliseconds, to wait until a Push request fails.
/// Special values:
/// Negative: wait infinitely.
/// 0: give up retrying immediately.
RAY_CONFIG(int, object_manager_push_timeout_ms, 10000)
/// Default chunk size for multi-chunk transfers to use in the object manager.
/// In the object manager, no single thread is permitted to transfer more
/// data than what is specified by the chunk size unless the number of object
/// chunks exceeds the number of available sending threads.
/// NOTE(ekl): this has been raised to lower broadcast overheads.
RAY_CONFIG(uint64_t, object_manager_default_chunk_size, 5 * 1024 * 1024)
/// The maximum number of outbound bytes to allow to be outstanding. This avoids
/// excessive memory usage during object broadcast to many receivers.
RAY_CONFIG(uint64_t,
object_manager_max_bytes_in_flight,
((uint64_t)2) * 1024 * 1024 * 1024)
/// Maximum number of IDs in one batch to send to GCS to delete keys.
RAY_CONFIG(uint32_t, maximum_gcs_deletion_batch_size, 1000)
/// Maximum number of items in one batch to scan/get/delete from GCS storage.
RAY_CONFIG(uint32_t, maximum_gcs_storage_operation_batch_size, 1000)
/// When getting objects from the object store, the max number of IDs to print in the
/// warning message.
RAY_CONFIG(uint32_t, object_store_get_max_ids_to_print_in_warning, 20)
/// Number of polling threads used by rpc server in gcs server. These threads poll for
/// requests and copy from the socket buffer to create the proto request object.
RAY_CONFIG(uint32_t,
gcs_server_rpc_server_thread_num,
std::max(1U, std::thread::hardware_concurrency() / 4U))
/// Number of polling threads for raylet + worker clients on the GCS. These threads poll
/// for replies and copy from the socket buffer to create the proto Reply object.
RAY_CONFIG(uint32_t,
gcs_server_rpc_client_thread_num,
std::max(1U, std::thread::hardware_concurrency() / 4U))
/// The interval at which the gcs server will health check the connection to the
/// external Redis server. If a health check fails, the GCS will crash itself.
/// Set to zero to disable health checking.
RAY_CONFIG(uint64_t, gcs_redis_heartbeat_interval_milliseconds, 100)
/// Duration to wait between retries for leasing a worker in the GCS server.
RAY_CONFIG(uint32_t, gcs_lease_worker_retry_interval_ms, 200)
/// Duration to wait between retries for creating an actor in the GCS server.
RAY_CONFIG(uint32_t, gcs_create_actor_retry_interval_ms, 200)
/// Exponential backoff params for the GCS to retry creating a placement group.
RAY_CONFIG(uint64_t, gcs_create_placement_group_retry_min_interval_ms, 100)
RAY_CONFIG(uint64_t, gcs_create_placement_group_retry_max_interval_ms, 1000)
RAY_CONFIG(double, gcs_create_placement_group_retry_multiplier, 1.5)
/// Maximum number of destroyed actors in GCS server memory cache.
RAY_CONFIG(uint32_t, maximum_gcs_destroyed_actor_cached_count, 100000)
/// Maximum number of dead nodes in GCS server memory cache.
RAY_CONFIG(uint32_t, maximum_gcs_dead_node_cached_count, 1000)
// The storage backend to use for the GCS. It can be either 'redis' or 'memory'.
RAY_CONFIG(std::string, gcs_storage, "memory")
/// Duration to sleep after failing to put an object in plasma because it is full.
RAY_CONFIG(uint32_t, object_store_full_delay_ms, 10)
/// The threshold to trigger a global GC.
RAY_CONFIG(double, plasma_store_usage_trigger_gc_threshold, 0.7)
/// The amount of time between automatic local Python GC triggers.
RAY_CONFIG(uint64_t, local_gc_interval_s, 90 * 60)
/// The min amount of time between local GCs (whether auto or mem pressure triggered).
RAY_CONFIG(uint64_t, local_gc_min_interval_s, 10)
/// The min amount of time between triggering global_gc in raylet. This only applies
/// to global GCs triggered due to plasma_store_usage_trigger_gc_threshold.
RAY_CONFIG(uint64_t, global_gc_min_interval_s, 30)
/// Duration to wait between retries for failed tasks.
RAY_CONFIG(uint32_t, task_retry_delay_ms, 0)
/// The base retry delay for exponential backoff when the task fails due to OOM.
/// No delay if this value is zero.
RAY_CONFIG(uint32_t, task_oom_retry_delay_base_ms, 1000)
/// The base retry delay for exponential backoff when an actor task fails with
/// ACTOR_UNAVAILABLE (e.g., actor is restarting or network error).
RAY_CONFIG(uint32_t, task_actor_unavailable_retry_delay_base_ms, 100)
/// The maximum retry delay for ACTOR_UNAVAILABLE exponential backoff.
RAY_CONFIG(uint32_t, task_actor_unavailable_retry_max_delay_ms, 5000)
/// Duration to wait between retrying to kill a task.
RAY_CONFIG(uint32_t, cancellation_retry_ms, 2000)
/// Determines if forking in Ray actors / tasks is supported.
/// Note that this only enables forking in workers, not drivers.
RAY_CONFIG(bool, support_fork, false)
/// Maximum timeout for GCS reconnection in seconds.
/// Each reconnection ping will be retried every 1 second.
RAY_CONFIG(uint32_t, gcs_rpc_server_reconnect_timeout_s, 60)
/// The timeout for GCS connection in seconds
RAY_CONFIG(int32_t, gcs_rpc_server_connect_timeout_s, 5)
/// gRPC channel reconnection related configs to GCS.
/// Check https://grpc.github.io/grpc/core/group__grpc__arg__keys.html for details
/// Note: `gcs_grpc_min_reconnect_backoff_ms` is (mis)used by gRPC as the connection
/// timeout. If your cluster has high latency, set it to > 4x the latency.
RAY_CONFIG(int32_t, gcs_grpc_max_reconnect_backoff_ms, 2000)
RAY_CONFIG(int32_t, gcs_grpc_min_reconnect_backoff_ms, 1000)
RAY_CONFIG(int32_t, gcs_grpc_initial_reconnect_backoff_ms, 100)
/// Maximum bytes of requests queued when RPCs fail because the GCS is down.
/// If the limit is reached, the core worker will hang until the GCS is reconnected.
/// By default, the value is 5GB.
RAY_CONFIG(uint64_t, gcs_grpc_max_request_queued_max_bytes, 1024UL * 1024 * 1024 * 5)
/// The duration between two checks for grpc status.
RAY_CONFIG(int32_t, grpc_client_check_connection_status_interval_milliseconds, 1000)
/// Due to a protocol drawback, the raylet needs to refresh the message if
/// no message is received for a while.
/// Refer to https://tinyurl.com/n6kvsp87 for more details.
RAY_CONFIG(int64_t, ray_syncer_message_refresh_interval_ms, 3000)
/// The batch size for metrics export.
/// Normally each time series is << 1KB, so a batch size of 10,000 means the expected
/// payload will be under 10MB.
RAY_CONFIG(int64_t, metrics_report_batch_size, 10000)
/// Whether task events (status change and profiling events) from the driver should be
/// ignored. Currently for testing only.
RAY_CONFIG(bool, task_events_skip_driver_for_test, false)
/// The interval duration for which task state events will be reported to GCS.
/// The reported data should only be used for observability.
/// Setting the value to 0 disables the task event recording and reporting.
RAY_CONFIG(int64_t, task_events_report_interval_ms, 1000)
/// The interval duration for which ray events will be reported to the event aggregator.
/// The reported data should only be used for observability.
/// Setting the value to 0 disables the ray event recording and reporting.
RAY_CONFIG(int64_t, ray_events_report_interval_ms, 1000)
/// The number of tasks tracked in GCS for task state events. Any additional events
/// from new tasks will evict events of tasks reported earlier.
/// Setting the value to -1 allows for unlimited task events stored in GCS.
RAY_CONFIG(int64_t, task_events_max_num_task_in_gcs, 100000)
/// The maximum number of dropped task attempts tracked per job at the GCS. When the GCS
/// is forced to stop tracking some task attempts that are lost, this will incur
/// potential partial data loss for a single task attempt (e.g. some task events were
/// dropped, but some were tracked). When this happens, users should be cautious of
/// inconsistency in the task events data.
RAY_CONFIG(int64_t,
task_events_max_dropped_task_attempts_tracked_per_job_in_gcs,
1 * 1000 * 1000)
/// Max number of task status events stored on
/// workers. Events will be evicted based on a FIFO order.
RAY_CONFIG(uint64_t, task_events_max_num_status_events_buffer_on_worker, 100 * 1000)
/// Max number of task status events that will be stored to export
/// for the export API. Events will be evicted based on a FIFO order.
RAY_CONFIG(uint64_t,
task_events_max_num_export_status_events_buffer_on_worker,
1000 * 1000)
/// Max number of task events to be sent in a single message to GCS. This caps both
/// the message size and the processing work on GCS.
RAY_CONFIG(uint64_t, task_events_send_batch_size, 10 * 1000)
/// Max number of task events to be written in a single flush iteration. This
/// caps the number of file writes per iteration.
RAY_CONFIG(uint64_t, export_task_events_write_batch_size, 10 * 1000)
/// Max number of profile events allowed to be tracked for a single task.
/// Setting the value to -1 allows unlimited profile events to be tracked.
RAY_CONFIG(int64_t, task_events_max_num_profile_events_per_task, 1000)
/// The max number of profile events allowed to be stored in the buffer on the worker
/// side. Events will be evicted based on a FIFO order.
RAY_CONFIG(uint64_t, task_events_max_num_profile_events_buffer_on_worker, 10 * 1000)
/// Max number of task attempts being dropped on the worker side to report to GCS.
/// Setting the value to -1 allows unlimited dropped task attempts in a single
/// report to GCS.
RAY_CONFIG(int64_t, task_events_dropped_task_attempt_batch_size, 10 * 1000)
/// Timeout in milliseconds to wait for task events to be flushed during shutdown.
/// During graceful shutdown, the TaskEventBuffer and RayEventRecorder will wait up to
/// this duration for in-flight gRPC calls to complete before stopping the io_service.
RAY_CONFIG(int64_t, task_events_shutdown_flush_timeout_ms, 5000)
/// The delay in ms after which GCS marks any still-running tasks from a finished job
/// as failed. Setting this value too small might result in some finished tasks being
/// marked as failed by GCS.
RAY_CONFIG(uint64_t, gcs_mark_task_failed_on_job_done_delay_ms, /* 15 secs */ 1000 * 15)
/// The delay in ms after which GCS marks any running tasks from a dead worker as
/// failed. Setting this value too small might result in some finished tasks being
/// marked as failed by GCS, since task events data are pushed to GCS asynchronously.
RAY_CONFIG(uint64_t, gcs_mark_task_failed_on_worker_dead_delay_ms, /* 1 secs */ 1000 * 1)
/// Whether or not we enable metrics collection.
RAY_CONFIG(bool, enable_metrics_collection, true)
/// Determines whether high-cardinality labels such as WorkerId and task and actor
/// names should be used in the metrics. For the complete definition, see
/// RAY_METRIC_CARDINALITY_LEVEL in ray_constants.py.
RAY_CONFIG(std::string, metric_cardinality_level, "legacy")
/// Whether to enable OpenTelemetry as the metrics collection backend; otherwise
/// OpenCensus is used.
RAY_CONFIG(bool, enable_open_telemetry, true)
/// Whether to disable the OpenTelemetry SDK logs. They are disabled by default
/// to prevent noisy gRPC errors during shutdown.
/// See https://github.com/ray-project/ray/issues/58256 for details.
RAY_CONFIG(bool, disable_open_telemetry_sdk_log, true)
/// Whether to enable Ray Event as the event collection backend. By default, the
/// Export API is used.
RAY_CONFIG(bool, enable_ray_event, false)
RAY_CONFIG(uint64_t, ray_event_recorder_max_queued_events, 10000)
/// Comma-separated list of components for which we enable gRPC metrics collection.
/// Only effective if `enable_metrics_collection` is also true. Incurs some performance
/// degradation.
///
/// Valid fields: "gcs".
/// TODO: it only works for the GCS now. The goal is to support "gcs,core_worker,raylet".
/// The problem is we need this config field *before* any gRPC call, but the raylet and
/// core_worker receive configs from the GCS and raylet respectively, so the configs are
/// only available *after* a gRPC call.
RAY_CONFIG(std::string, enable_grpc_metrics_collection_for, "")
/// Only effective if `enable_metrics_collection` is also true.
///
/// If > 0, we monitor each instrumented_io_context every
/// `io_context_event_loop_lag_collection_interval_ms` milliseconds, by posting a task to
/// the io_context to measure the duration from post to run. The metric is
/// `ray_io_context_event_loop_lag_ms`.
///
/// A probe task is only posted after a previous probe task has completed.
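/// In other words, each probe reports lag = (time the probe task actually ran) -
/// (time it was posted).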
RAY_CONFIG(int64_t, io_context_event_loop_lag_collection_interval_ms, 10000)
// Max number of bytes of inlined objects in a task RPC request/response.
RAY_CONFIG(int64_t, task_rpc_inlined_bytes_limit, 10 * 1024 * 1024)
/// Maximum number of pending lease requests per scheduling category.
/// -1 means that Ray should automatically set this to the number of nodes in
/// the cluster.
RAY_CONFIG(int64_t, max_pending_lease_requests_per_scheduling_category, -1)
/// Wait timeout for dashboard agent registration.
#ifdef _WIN32
// agent startup time can involve creating conda environments
RAY_CONFIG(uint32_t, agent_register_timeout_ms, 100 * 1000)
#else
RAY_CONFIG(uint32_t, agent_register_timeout_ms, 30 * 1000)
#endif
/// If true, the agent checks the health of its parent process by reading a pipe.
/// If false, it checks the parent PID using psutil.
RAY_CONFIG(bool, enable_pipe_based_agent_to_parent_health_check, true)
/// If the agent manager fails to communicate with the dashboard agent or the runtime env
/// agent, we will retry after this interval.
RAY_CONFIG(uint32_t, agent_manager_retry_interval_ms, 1000)
/// The maximum number of resource shapes included in the resource
/// load reported by each raylet.
RAY_CONFIG(int64_t, max_resource_shapes_per_load_report, 100)
/// The timeout for synchronous GCS requests in seconds.
RAY_CONFIG(int64_t, gcs_server_request_timeout_seconds, 60)
/// Whether to enable worker prestarting: https://github.com/ray-project/ray/issues/12052
RAY_CONFIG(bool, enable_worker_prestart, false)
/// Whether to enable worker prestarting on the first driver.
/// TODO(clarng): reconcile with enable_worker_prestart
RAY_CONFIG(bool, prestart_worker_first_driver, true)
/// The interval of periodic idle worker killing. Value of 0 means worker capping is
/// disabled.
RAY_CONFIG(uint64_t, kill_idle_workers_interval_ms, 200)
/// The idle time threshold for an idle worker to be killed.
RAY_CONFIG(int64_t, idle_worker_killing_time_threshold_ms, 1000)
/// The soft limit of the number of workers to keep around.
/// We apply this limit to the idle workers instead of total workers,
/// because the total number of workers used depends on the
/// application. -1 means using the available number of CPUs.
RAY_CONFIG(int64_t, num_workers_soft_limit, -1)
// The interval at which metrics are exported, in milliseconds.
RAY_CONFIG(uint64_t, metrics_report_interval_ms, 10000)
/// Enable the task timeline. If this is enabled, certain events such as task
/// execution are profiled and sent to the GCS.
/// This requires RAY_task_events_report_interval_ms > 0, so that events will
/// be sent to GCS.
RAY_CONFIG(bool, enable_timeline, true)
/// The maximum number of pending placement group entries that are reported to the
/// monitor to autoscale the cluster.
RAY_CONFIG(int64_t, max_placement_group_load_report_size, 1000)
/* Configuration parameters for object spilling. */
/// JSON configuration that describes the external storage. This is passed to
/// Python IO workers to determine how to store/restore an object to/from
/// external storage.
RAY_CONFIG(std::string, object_spilling_config, "")
/// The path to spill objects to. The same path will be used as the object store
/// fallback directory as well. When both object_spilling_config and
/// object_spilling_directory are set, object_spilling_directory will take
/// precedence. When object_spilling_directory is also set via ray.init() or ray start,
/// the directory set with ray.init() or ray start will take precedence.
RAY_CONFIG(std::string, object_spilling_directory, "")
/// Log an ERROR-level message about spilling every time this amount of bytes has been
/// spilled, with exponential increase in interval. This can be set to zero to disable.
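/// For example, with the default 2 GiB value, messages would be logged after roughly
/// 2 GiB, 4 GiB, 8 GiB, ... of cumulative spilled bytes.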
RAY_CONFIG(int64_t, verbose_spill_logs, 2L * 1024 * 1024 * 1024)
/// Whether to enable automatic object spilling. If enabled, then
/// Ray will choose objects to spill when the object store is out of
/// memory.
RAY_CONFIG(bool, automatic_object_spilling_enabled, true)
/// The maximum number of I/O workers that the raylet starts.
RAY_CONFIG(int, max_io_workers, 4)
/// Ray's object spilling fuses small objects into a single file before flushing them,
/// to optimize performance.
/// Ray will try to spill at least this many bytes at once, fusing objects up to
/// max_fused_object_count. 100 MB by default. Setting this value beyond
/// --object-store-memory is not recommended.
RAY_CONFIG(int64_t, min_spilling_size, 100 * 1024 * 1024)
/// Maximum size (bytes) of a single spilled file (i.e. one spill worker request).
/// When > 0, the raylet caps the total bytes fused into a single spill request.
/// This helps avoid generating very large spill files that may be hard to delete
/// promptly when multiple object references keep them alive (to avoid running out of
/// disk space).
/// Trade-off: smaller caps reduce spill fusion and can lower effective spill throughput
/// due to higher per-file overhead. If spilling cannot keep up with allocation under
/// memory pressure, this may increase the likelihood of object store OOMs.
/// Set to -1 to disable this limit.
RAY_CONFIG(int64_t, max_spilling_file_size_bytes, -1)
/// If set to less than 1.0, Ray will start spilling objects when existing primary objects
/// take more than this percentage of the available memory.
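/// For example, the default of 0.8 starts spilling once primary objects occupy more
/// than 80% of the available object store memory.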
RAY_CONFIG(float, object_spilling_threshold, 0.8)
/// Maximum number of objects that can be fused into a single file.
RAY_CONFIG(int64_t, max_fused_object_count, 2000)
/// Grace period until we throw the OOM error to the application in seconds.
/// In unlimited allocation mode, this is the time delay prior to fallback allocating.
RAY_CONFIG(int64_t, oom_grace_period_s, 2)
/// Whether or not the external storage is the local file system.
/// Note that this value should be overridden based on the storage type
/// specified by object_spilling_config.
RAY_CONFIG(bool, is_external_storage_type_fs, true)
/// Controls the capacity threshold for the Ray local file system (for the object
/// store). Once we are over capacity, all subsequent object creation will fail.
RAY_CONFIG(float, local_fs_capacity_threshold, 0.95)
/// Controls the frequency of checking disk usage.
RAY_CONFIG(uint64_t, local_fs_monitor_interval_ms, 100)
/* Configuration parameters for locality-aware scheduling. */
/// Whether to enable locality-aware leasing. If enabled, then Ray will consider task
/// dependency locality when choosing a worker for leasing.
RAY_CONFIG(bool, locality_aware_leasing_enabled, true)
/* Configuration parameters for logging */
/// Parameters for log rotation. This value is equivalent to RotatingFileHandler's
/// maxBytes argument.
RAY_CONFIG(int64_t, log_rotation_max_bytes, 100 * 1024 * 1024)
/// Parameters for log rotation. This value is equivalent to RotatingFileHandler's
/// backupCount argument.
RAY_CONFIG(int64_t, log_rotation_backup_count, 5)
/// For tasks that can't be sent because of a network error, we'll never receive a DEAD
/// notification; in this case we'll wait for a fixed timeout value and then mark the
/// task as failed.
RAY_CONFIG(int64_t, timeout_ms_task_wait_for_death_info, 1000)
/// The core worker heartbeat interval. During each heartbeat, the worker
/// reports its load to the raylet.
RAY_CONFIG(int64_t, core_worker_internal_heartbeat_ms, 1000)
/// Interval at which workers report their backlog of tasks with unresolved dependencies
/// to the local raylet, used for autoscaling decisions.
RAY_CONFIG(int64_t, report_worker_backlog_interval_ms, 1000)
/// Starting timeout for core worker grpc server reconnection (will
/// exponentially increase until the maximum timeout).
RAY_CONFIG(uint32_t, core_worker_rpc_server_reconnect_timeout_base_s, 1)
/// Maximum timeout for core worker grpc server reconnection.
RAY_CONFIG(uint32_t, core_worker_rpc_server_reconnect_timeout_max_s, 60)
/// Maximum amount of memory that will be used by running tasks' args.
RAY_CONFIG(float, max_task_args_memory_fraction, 0.7)
/// The maximum number of objects to publish for each publish call.
RAY_CONFIG(int, publish_batch_size, 5000)
/// Maximum size in bytes of buffered messages per pubsub channel. Large
/// applications (1k+ nodes, 100k+ tasks or actors) may see memory pressure in
/// the GCS due to high system-level pubsub traffic. Reducing this config value
/// can help reduce memory pressure, at the cost of dropping some published
/// messages (e.g., worker logs printed to driver stdout). See
/// src/ray/pubsub/publisher.cc for the current pubsub channels that are
/// subject to this cap.
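/// (The default below, 1 << 30 bytes, is 1 GiB.)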
RAY_CONFIG(int, publisher_entity_buffer_max_bytes, 1 << 30)
/// The maximum command batch size.
RAY_CONFIG(int64_t, max_command_batch_size, 2000)
/// The maximum batch size for OBOD report.
RAY_CONFIG(int64_t, max_object_report_batch_size, 2000)
/// For Ray publishers, the minimum time, in ms, before an inactive subscriber
/// connection is dropped. In the current implementation, a subscriber might be dead
/// for up to 3x the configured time before it is deleted from the publisher, i.e.
/// deleted in 300s ~ 900s.
RAY_CONFIG(uint64_t, subscriber_timeout_ms, 300 * 1000)
// This is the minimum time an actor will remain in the actor table before
// being garbage collected when a job finishes.
RAY_CONFIG(uint64_t, gcs_actor_table_min_duration_ms, /* 5 min */ 60 * 1000 * 5)
RAY_CONFIG(uint32_t, max_error_msg_size_bytes, 512 * 1024)
// The number of seconds to wait for the Raylet to start. This is normally
// fast, but when RAY_preallocate_plasma_memory=1 is set, it may take some time
// to populate all the pages (at a few GB/s) on Raylet startup.
RAY_CONFIG(uint32_t,
raylet_start_wait_time_s,
std::getenv("RAY_preallocate_plasma_memory") != nullptr &&
std::getenv("RAY_preallocate_plasma_memory") == std::string("1")
? 120
: 30)
/// The scheduler will treat these predefined resource types as unit_instance.
/// Default predefined_unit_instance_resources is "GPU".
/// When set to "CPU,GPU", CPU will also be treated as unit_instance.
RAY_CONFIG(std::string, predefined_unit_instance_resources, "GPU")
/// The scheduler will treat these custom resource types as unit_instance.
/// This allows the scheduler to provide chip IDs for custom resources like
/// "neuron_cores", "TPUs", and "FPGAs".
/// Default custom_unit_instance_resources is "neuron_cores,TPU,NPU,HPU,RBLN".
/// When "FPGA" is added to the list, FPGA will also be treated as unit_instance.
RAY_CONFIG(std::string, custom_unit_instance_resources, "neuron_cores,TPU,NPU,HPU,RBLN")
/// The name of the system-created concurrency group for actors. This group is
/// created with 1 thread, and is created lazily. The intended usage is for
/// Ray-internal auxiliary tasks (e.g., compiled graph workers).
RAY_CONFIG(std::string, system_concurrency_group_name, "_ray_system")
/// Number of ServerCall instances for each RPC service handler.
///
/// NOTE: Default value is temporarily pegged at `gcs_server_rpc_server_thread_num * 100`
/// to keep it at the level it has been prior to
/// https://github.com/ray-project/ray/pull/47664
RAY_CONFIG(int64_t,
gcs_max_active_rpcs_per_handler,
gcs_server_rpc_server_thread_num() * 100)
/// gRPC keepalive send interval for the server.
/// This is currently only configured in the GCS server.
RAY_CONFIG(int64_t, grpc_keepalive_time_ms, 10000)
/// gRPC keepalive timeout for the server.
RAY_CONFIG(int64_t, grpc_keepalive_timeout_ms, 20000)
/// NOTE: we set a loose client keepalive because some components have a failure model
/// that considers network failures as component failures, and an aggressive keepalive
/// configuration breaks that assumption. We should apply this to every other component
/// after we change this failure assumption in the code.
/// gRPC keepalive send interval for the client.
RAY_CONFIG(int64_t, grpc_client_keepalive_time_ms, 300000)
/// gRPC keepalive timeout for the client.
RAY_CONFIG(int64_t, grpc_client_keepalive_timeout_ms, 120000)
RAY_CONFIG(int64_t, grpc_client_idle_timeout_ms, 1800000)
/// gRPC streaming buffer size.
/// Set to 512KB by default.
RAY_CONFIG(int64_t, grpc_stream_buffer_size, 512 * 1024);
/// Whether to use the log reporter in the event framework.
RAY_CONFIG(bool, event_log_reporter_enabled, true)
/// Whether or not we should also write an event log to a log file.
/// This has no effect if `event_log_reporter_enabled` is false.
RAY_CONFIG(bool, emit_event_to_log_file, false)
/// Event severity threshold value
RAY_CONFIG(std::string, event_level, "warning")
/// Whether to avoid scheduling CPU requests on GPU nodes.
RAY_CONFIG(bool, scheduler_avoid_gpu_nodes, true)
/// Whether to skip running local GC in runtime env.
RAY_CONFIG(bool, runtime_env_skip_local_gc, false)
/// The namespace for the storage.
/// This field is used to isolate data stored in the DB.
RAY_CONFIG(std::string, external_storage_namespace, "default")
/// Whether or not to use TLS.
RAY_CONFIG(bool, USE_TLS, false)
/// Location of TLS credentials
RAY_CONFIG(std::string, TLS_SERVER_CERT, "")
RAY_CONFIG(std::string, TLS_SERVER_KEY, "")
RAY_CONFIG(std::string, TLS_CA_CERT, "")
/// Location of Redis TLS credentials
/// https://github.com/redis/hiredis/blob/c78d0926bf169670d15cfc1214e4f5d21673396b/README.md#hiredis-openssl-wrappers
RAY_CONFIG(std::string, REDIS_CA_CERT, "")
RAY_CONFIG(std::string, REDIS_CA_PATH, "")
RAY_CONFIG(std::string, REDIS_CLIENT_CERT, "")
RAY_CONFIG(std::string, REDIS_CLIENT_KEY, "")
RAY_CONFIG(std::string, REDIS_SERVER_NAME, "")
/// gRPC delay testing flags.
/// To use this,
/// export RAY_testing_asio_delay_us="method1=min_val:max_val,method2=20:100"
/// The delay is a random number within the given interval. If the method equals '*',
/// it will apply to all methods.
RAY_CONFIG(std::string, testing_asio_delay_us, "")
/// To use this,
/// export
/// RAY_testing_rpc_failure='{"method1":{"num_failures":X,"req_failure_prob":Y,"resp_failure_prob":Z,"in_flight_failure_prob":W}}'
///
/// If you want to test all RPC failures you can use * as the method name and you can set
/// -1 num_failures to have unlimited failures.
/// Ex. unlimited failures for all RPCs with 25% request failures, 50% response
/// failures, and 10% in-flight failures.
/// export
/// RAY_testing_rpc_failure='{"*":{"num_failures":-1,"req_failure_prob":25,"resp_failure_prob":50,"in_flight_failure_prob":10}}'
/// This will set the probabilities for all RPCs to 25% for request failures, 50% for
/// response failures, and 10% for in-flight failures.
/// NOTE: Setting the wildcard will override any configuration for other methods.
///
/// You can also provide an optional fifth, sixth, and/or seventh parameter to specify
/// that there should be at least a certain number of failures.
/// The 5th parameter is for request failures.
/// The 6th parameter is for response failures.
/// The 7th parameter is for in-flight failures.
/// By default these are set to 0, but by setting them to positive values it guarantees
/// that the first X request RPCs will fail, followed by Y response RPCs that will fail,
/// followed by Z in-flight RPCs that will fail.
/// Afterwards, it will revert to the probabilistic failures. You can combine this with
/// the wildcard so that each RPC method will have the same lower bounds applied.
///
/// Ex. unlimited failures for all RPCs with 25% request failures, 50% response failures,
/// and 10% in-flight failures with at least 2 request failures, 3 response failures, and
/// 1 in-flight failure:
/// export
/// RAY_testing_rpc_failure='{"*":{"num_failures":-1,"req_failure_prob":25,"resp_failure_prob":50,"in_flight_failure_prob":10,"num_lower_bound_req_failures":2,"num_lower_bound_resp_failures":3,"num_lower_bound_in_flight_failures":1}}'
RAY_CONFIG(std::string, testing_rpc_failure, "")
/// If this is set, when injecting RPC failures, we'll check if the server and client have
/// the same address. If they do, we won't inject the failure.
RAY_CONFIG(bool, testing_rpc_failure_avoid_intra_node_failures, false)
/// The following are configs for the health check. They are borrowed
/// from the k8s health probe (shorturl.at/jmTY3).
/// The delay to send the first health check.
RAY_CONFIG(int64_t, health_check_initial_delay_ms, 5000)
/// The interval between two health checks.
RAY_CONFIG(int64_t, health_check_period_ms, 3000)
/// The timeout for a health check.
RAY_CONFIG(int64_t, health_check_timeout_ms, 10000)
/// The threshold to consider a node dead.
RAY_CONFIG(int64_t, health_check_failure_threshold, 5)
/// Thread pool size for sending replies in grpc server (system components: raylet, GCS).
RAY_CONFIG(int64_t,
num_server_call_thread,
std::max((int64_t)1, (int64_t)(std::thread::hardware_concurrency() / 4U)))
/// Thread pool size for sending replies in grpc server (CoreWorkers).
/// https://github.com/ray-project/ray/issues/58351 shows the
/// reply path is light enough that 2 threads is sufficient.
RAY_CONFIG(int64_t,
core_worker_num_server_call_thread,
std::thread::hardware_concurrency() >= 8 ? 2 : 1);
/// Use madvise to prevent worker/raylet coredumps from including
/// the mapped plasma pages.
RAY_CONFIG(bool, worker_core_dump_exclude_plasma_store, true)
RAY_CONFIG(bool, raylet_core_dump_exclude_plasma_store, true)
// Instruct the Python default worker to preload the specified imports.
// This is specified as a comma-separated list.
// If left empty, no such attempt will be made.
// Example: RAY_preload_python_modules=tensorflow,pytorch
RAY_CONFIG(std::vector<std::string>, preload_python_modules, {})
// By default, the raylet sends a liveness self-check to the GCS every 60s.
RAY_CONFIG(int64_t, raylet_liveness_self_check_interval_ms, 60000)
// Instruct the CoreWorker to kill its child processes when
// it exits. This prevents certain classes of resource leaks
// caused by worker processes leaking child processes.
// If a user relies on Ray's old behavior of leaking processes,
// they can disable this behavior with
// RAY_kill_child_processes_on_worker_exit=false. We anticipate
// keeping this flag around at least until Ray 2.5.
// See https://github.com/ray-project/ray/pull/33976 for more
// info.
RAY_CONFIG(bool, kill_child_processes_on_worker_exit, true)
// Make the Raylet and CoreWorker become Linux subreapers, and let the Raylet kill
// the child processes of a worker when the worker exits. This is useful for
// the case where the worker crashed and had no chance to clean up its child processes.
// Only works on Linux >= 3.4. On other platforms, this flag is ignored.
// See https://github.com/ray-project/ray/pull/42992 for more info.
RAY_CONFIG(bool, kill_child_processes_on_worker_exit_with_raylet_subreaper, false)