// Copyright 2017 The Ray Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// This header file is used to avoid code duplication.
// It can be included multiple times in ray_config.h, and each inclusion
// could use a different definition of the RAY_CONFIG macro.
// Macro definition format: RAY_CONFIG(type, name, default_value).
// NOTE: This file should NOT be included in any file other than ray_config.h.
/// The duration between dumping debug info to logs, or 0 to disable.
RAY_CONFIG(uint64_t, debug_dump_period_milliseconds, 10000)
/// Whether to enable Ray event stats collection.
RAY_CONFIG(bool, event_stats, true)
/// Whether to enable Ray event stats metrics for main services
/// such as the GCS and raylet (which today are the sole consumers of
/// this config).
RAY_CONFIG(bool, emit_main_service_metrics, true)
/// Whether to enable cluster authentication.
RAY_CONFIG(bool, enable_cluster_auth, true)
/// Whether to enable token-based authentication for RPC calls.
/// The value is converted to the AuthenticationMode enum defined in
/// rpc/authentication/authentication_mode.h;
/// use GetAuthenticationMode() to get the authentication mode enum value.
RAY_CONFIG(std::string, AUTH_MODE, "disabled")
/// Whether to enable Kubernetes token-based authentication for RPC calls.
RAY_CONFIG(bool, ENABLE_K8S_TOKEN_AUTH, false)
/// The interval at which periodic event loop stats are printed.
/// -1 means the feature is disabled; in that case, stats are available
/// in the associated process's log file.
/// NOTE: This requires event_stats=1.
RAY_CONFIG(int64_t, event_stats_print_interval_ms, 60000)
/// In theory, this is used to detect Ray cookie mismatches.
/// This magic number (hex for "RAY") is used instead of zero because, while it
/// is still possible that some random program sends an int64_t that is zero,
/// it is much less likely that a program sends this particular magic number.
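/// (ASCII: 0x52 = 'R', 0x41 = 'A', 0x59 = 'Y' in the three high-order bytes.)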
RAY_CONFIG(int64_t, ray_cookie, 0x5241590000000000)
/// The duration that a single handler on the event loop can take before a
/// warning is logged that the handler is taking too long.
RAY_CONFIG(int64_t, handler_warning_timeout_ms, 1000)
/// The duration between resource load pulls by the GCS.
RAY_CONFIG(uint64_t, gcs_pull_resource_loads_period_milliseconds, 1000)
/// The duration between reporting resources sent by the raylets.
RAY_CONFIG(uint64_t, raylet_report_resources_period_milliseconds, 100)
/// The interval at which the raylet checks memory pressure and sends GC requests.
RAY_CONFIG(uint64_t, raylet_check_gc_period_milliseconds, 100)
/// Threshold at which the node is considered beyond its memory capacity. If memory
/// usage is above memory_usage_threshold and free space is below min_memory_free_bytes,
/// the node will start killing processes to free up space. Valid range: [0, 1].
/// Note: when resource isolation is enabled, the memory usage threshold is set to
/// total memory - system reserved memory (can be specified in ray start) -
/// kill_memory_buffer_bytes. Notice that the formula does not account for object store
/// memory in system reserved memory. To configure the usage threshold, adjust the
/// system reserved memory in the ray start command instead.
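/// Example override, using the RAY_<name> environment variable convention referenced
/// elsewhere in this file (e.g. RAY_grpc_enable_http_proxy):
///   RAY_memory_usage_threshold=0.9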
RAY_CONFIG(float, memory_usage_threshold, 0.95)
/// The interval between runs of the memory usage monitor.
/// Monitor is disabled when this value is 0.
RAY_CONFIG(uint64_t, memory_monitor_refresh_ms, 250)
/// The minimum amount of free space. If memory usage is above
/// memory_usage_threshold and free space is below min_memory_free_bytes, the node
/// will start killing processes to free up space. Disabled if it is -1.
///
/// This value is useful for larger hosts, where memory_usage_threshold could leave
/// a large chunk of memory unusable; e.g. on a host with 64GB of memory, a 0.9
/// threshold means 6.4 GB of the memory will not be usable.
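/// For example, to always keep at least 2 GiB free regardless of the threshold, set
/// this to 2147483648 (2 * 1024 * 1024 * 1024).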
RAY_CONFIG(int64_t, min_memory_free_bytes, (int64_t)-1)
/// The amount of memory to free under the memory usage threshold when
/// killing workers via the worker killing policy.
RAY_CONFIG(uint64_t, kill_memory_buffer_bytes, 3ULL * 1024 * 1024 * 1024) // 3GB
/// The reserved memory bytes for system processes
/// enforced via cgroup memory.min constraint which guarantees
/// that the system processes' memory will not be reclaimed under any conditions.
/// Default is 0, meaning no memory.min constraint is applied.
/// By default, if resource isolation is enabled, system reserved memory
/// will be protected via memory.low instead. Only configure this value
/// if you are certain that you want the min constraint protection.
RAY_CONFIG(int64_t, system_memory_bytes_min, 0)
/// The proportion of total memory the user processes are allowed to use.
/// Enforced by the cgroup memory.high constraint, which throttles the
/// user processes when the threshold is reached.
/// Default is 1.0, meaning the user processes are allowed to use 100% of the total
/// memory. Only configure this value if you are confident that
/// the configuration is desirable. Bad constraint configurations may
/// lead to significant system performance degradation.
RAY_CONFIG(float, user_memory_proportion_high, 1.0)
/// The proportion of total memory the user processes are allowed to use.
/// Enforced by the cgroup memory.max constraint, which triggers the
/// kernel OOM killer when the threshold is reached.
/// Default is 1.0, meaning the user processes are allowed to use 100% of the total
/// memory. Only configure this value if you are confident that
/// the configuration is desirable. Bad constraint configurations may
/// lead to significant system performance degradation.
RAY_CONFIG(float, user_memory_proportion_max, 1.0)
/// The TTL after which a task failure entry is considered
/// eligible for garbage collection.
RAY_CONFIG(uint64_t, task_failure_entry_ttl_ms, 15 * 60 * 1000)
/// The number of retries for a task or actor when it fails because the process
/// was killed while the node was running low on memory.
/// The process killing is done by the memory monitor, which is enabled via
/// memory_monitor_refresh_ms. If the task or actor is not retriable, this value is
/// ignored. This retry counter is only used when the process is killed due to memory
/// pressure; the task's or actor's own retry counter is used when it fails in other
/// ways not related to running out of memory. Retries indefinitely if the value is -1.
RAY_CONFIG(uint64_t, task_oom_retries, -1)
/// Whether to report placement or regular resource usage for an actor.
/// Reporting placement may cause the autoscaler to overestimate the resources
/// required by the cluster, but reporting regular resources may lead to no
/// autoscaling when an actor can't be placed.
/// https://github.com/ray-project/ray/issues/26806
RAY_CONFIG(bool, report_actor_placement_resources, true)
/// Whether to record the creation sites of object references. This adds more
/// information to `ray memory`, but introduces a little extra overhead when
/// creating object references (e.g. 5~10 microsec per call in Python).
/// TODO: maybe group this under RAY_DEBUG.
RAY_CONFIG(bool, record_ref_creation_sites, false)
/// Collects the stacktrace of the task invocation, or actor creation. The stacktrace is
/// serialized into the TaskSpec and is viewable from the Dashboard. Default is disabled.
RAY_CONFIG(bool, record_task_actor_creation_sites, false)
/// Objects that have been unpinned are
/// added to a local cache. When the cache is flushed, all objects in the cache
/// will be eagerly evicted in a batch by freeing all plasma copies in the
/// cluster. If set, then this is the duration between attempts to flush the
/// local cache. If this is set to 0, then the objects will be freed as soon as
/// they enter the cache. To disable eager eviction, set this to -1.
/// NOTE(swang): The timer is checked by the raylet during every heartbeat, so
/// this should be set to a value larger than
/// raylet_heartbeat_period_milliseconds.
RAY_CONFIG(int64_t, free_objects_period_milliseconds, 1000)
/// Objects that have been unpinned are
/// added to a local cache. When the cache is flushed, all objects in the cache
/// will be eagerly evicted in a batch by freeing all plasma copies in the
/// cluster. This is the maximum number of objects in the local cache before it
/// is flushed. To disable eager eviction, set free_objects_period_milliseconds
/// to -1.
RAY_CONFIG(size_t, free_objects_batch_size, 100)
/// Whether to pin object lineage, i.e. the task that created the object and
/// the task's recursive dependencies. If this is set to true, then the system
/// will attempt to reconstruct the object from its lineage if the object is
/// lost.
RAY_CONFIG(bool, lineage_pinning_enabled, true)
/// Maximum amount of lineage to keep in bytes. This includes the specs of all
/// tasks that have already finished but that may be retried again.
/// If we reach this limit, 50% of the current lineage will be evicted and
/// objects that are still in scope will no longer be reconstructed if lost.
/// Each task spec is on the order of 1KB but can be much larger if it has many
/// inlined args.
RAY_CONFIG(int64_t, max_lineage_bytes, 1024 * 1024 * 1024)
/// Whether to pre-populate plasma memory. This avoids memory allocation failures
/// at runtime (SIGBUS errors when creating new objects); however, it uses more memory
/// upfront and can slow down Ray startup.
/// See also: https://github.com/ray-project/ray/issues/14182
RAY_CONFIG(bool, preallocate_plasma_memory, false)
// If true, we place a soft cap on the number of scheduling classes, see
// `worker_cap_initial_backoff_delay_ms`.
RAY_CONFIG(bool, worker_cap_enabled, true)
/// We place a soft cap on the number of tasks of a given scheduling class that
/// can run at once, to limit the total number of worker processes. After the
/// specified interval, the next task above that cap is allowed to run. The time
/// before subsequent tasks (above the cap) are allowed to run increases
/// exponentially. The soft cap is needed to prevent deadlock in the case where
/// a task begins to execute and tries to `ray.get` another task of the same
/// class.
RAY_CONFIG(int64_t, worker_cap_initial_backoff_delay_ms, 1000)
/// After reaching the worker cap, the backoff delay will grow exponentially,
/// until it hits a maximum delay.
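/// Illustrative sketch (the growth factor here is an assumption, not taken from this
/// file): with a doubling schedule, successive delays would be 1s, 2s, 4s, 8s, and
/// then stay capped at 10s.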
RAY_CONFIG(int64_t, worker_cap_max_backoff_delay_ms, 1000 * 10)
/// The fraction of resource utilization on a node after which the scheduler starts
/// to prefer spreading tasks to other nodes. This balances between locality and
/// even balancing of load. Low values (min 0.0) encourage more load spreading.
RAY_CONFIG(float, scheduler_spread_threshold, 0.5)
/// Used by the default hybrid policy only. The scheduler will randomly pick
/// one node from the top k in the cluster to improve load balancing. The
/// scheduler guarantees k is at least equal to this fraction * the number of
/// nodes in the cluster.
RAY_CONFIG(float, scheduler_top_k_fraction, 0.2);
/// Used by the default hybrid policy only. The scheduler will randomly pick
/// one node from the top k in the cluster to improve load balancing. The
/// scheduler guarantees k is at least equal to scheduler_top_k_absolute.
RAY_CONFIG(int32_t, scheduler_top_k_absolute, 1);
/// Whether to only report the usage of pinned copies of objects in the
/// object_store_memory resource. This means nodes holding only secondary copies
/// will become eligible for removal by the autoscaler.
RAY_CONFIG(bool, scheduler_report_pinned_bytes_only, true)
// The max allowed size in bytes of a return object from direct actor calls.
// Objects larger than this size will be spilled/promoted to plasma.
RAY_CONFIG(int64_t, max_direct_call_object_size, 100 * 1024)
// The max gRPC message size (the gRPC internal default is 4MB). We use a higher
// limit in Ray to avoid crashing with many small inlined task arguments.
// Keep in sync with GCS_STORAGE_MAX_SIZE in packaging.py.
RAY_CONFIG(size_t, max_grpc_message_size, 512 * 1024 * 1024)
// The max gRPC message size (the gRPC internal default is 4MB) in communication with the
// Agent.
//
// NOTE: This has to be kept in sync with AGENT_GRPC_MAX_MESSAGE_LENGTH in
// ray_constants.py
RAY_CONFIG(int64_t, agent_max_grpc_message_size, 20 * 1024 * 1024)
// Retry timeout for trying to create a gRPC server. Only applies if the number
// of retries is non-zero.
RAY_CONFIG(int64_t, grpc_server_retry_timeout_milliseconds, 1000)
// Whether to allow HTTP proxy on GRPC clients. Disable HTTP proxy by default since it
// disrupts local connections. Note that this config item only controls GrpcClient in
// `src/ray/rpc/grpc_client.h`. Python GRPC clients are not directly controlled by this.
// NOTE (kfstorm): DO NOT set this config item via `_system_config`, use
// `RAY_grpc_enable_http_proxy` environment variable instead so that it can be passed to
// non-C++ children processes such as dashboard agent.
RAY_CONFIG(bool, grpc_enable_http_proxy, false)
/// Warn if more than this many tasks are queued for submission to an actor.
/// It likely indicates a bug in the user code.
RAY_CONFIG(uint64_t, actor_excess_queueing_warn_threshold, 5000)
/// When trying to resolve an object, the initial period that the raylet will
/// wait before contacting the object's owner to check if the object is still
/// available. This is a lower bound on the time to report the loss of an
/// object stored in the distributed object store in the case that the worker
/// that created the original ObjectRef dies.
RAY_CONFIG(int64_t, object_timeout_milliseconds, 100)
/// The maximum duration that workers can hold on to another worker's lease
/// for direct task submission until it must be returned to the raylet.
RAY_CONFIG(int64_t, worker_lease_timeout_milliseconds, 500)
/// The interval at which the workers will check if their raylet has gone down.
/// When this happens, they will kill themselves.
RAY_CONFIG(uint64_t, raylet_death_check_interval_milliseconds, 1000)
/// These are used by the worker to set the interval for checking signals and
/// batching requests when getting objects.
RAY_CONFIG(int64_t, get_check_signal_interval_milliseconds, 1000)
RAY_CONFIG(int64_t, worker_fetch_request_size, 10000)
/// How long to wait for a fetch to complete during ray.get before warning the
/// user.
RAY_CONFIG(int64_t, fetch_warn_timeout_milliseconds, 60000)
/// How long to wait for a fetch before timing it out and throwing an error to
/// the user. This error should only be seen if there is extreme pressure on
/// the object directory, or if there is a bug in either object recovery or the
/// object directory.
RAY_CONFIG(int64_t, fetch_fail_timeout_milliseconds, 600000)
/// Temporary workaround for https://github.com/ray-project/ray/pull/16402.
RAY_CONFIG(bool, yield_plasma_lock_workaround, true)
/// Number of times raylet client tries connecting to a raylet.
RAY_CONFIG(int64_t, raylet_client_num_connect_attempts, 10)
RAY_CONFIG(int64_t, raylet_client_connect_timeout_milliseconds, 1000)
/// The duration that we wait after sending a worker SIGTERM before sending
/// the worker SIGKILL.
RAY_CONFIG(int64_t, kill_worker_timeout_milliseconds, 5000)
/// Timeout for graceful actor shutdown (e.g. when actor goes out of scope).
/// If an actor does not gracefully shut down within this timeout, it will be force
/// killed. Set to -1 for infinite timeout to prevent the actor from being force killed
/// during graceful shutdown.
RAY_CONFIG(int64_t, actor_graceful_shutdown_timeout_ms, 30000)
/// The duration that we wait after the worker is launched before the
/// starting_worker_timeout_callback() is called.
RAY_CONFIG(int64_t, worker_register_timeout_seconds, 60)
/// The maximum workers raylet can start at the same time.
/// 0 means it will use the default (number of CPUs).
RAY_CONFIG(int64_t, worker_maximum_startup_concurrency, 0)
/// Maximum number of retries for pop worker before the task is
/// cancelled. 0 means no retry (fail immediately), default is 5.
/// Retries indefinitely if the value is -1.
RAY_CONFIG(int32_t, pop_worker_max_retries, 5)
/// The maximum number of workers to iterate whenever we analyze the resources usage.
RAY_CONFIG(uint32_t, worker_max_resource_analysis_iteration, 128)
/// The maximum number of generator returns. We are using this to pre-reserve
/// Ray object ID indexes.
/// The first N indexes are for num_returns.
/// The next max_num_generator_returns indexes are for generator return.
/// The rest are for ray.put.
RAY_CONFIG(uint32_t, max_num_generator_returns, 100 * 1000 * 1000)
/// A value to add to workers' OOM score adjustment, so that the OS prioritizes
/// killing these over the raylet. 0 or positive values only (negative values
/// require sudo permissions).
/// NOTE(swang): Linux only.
RAY_CONFIG(int, worker_oom_score_adjustment, 1000)
/// Sets workers' nice value on POSIX systems, so that the OS prioritizes CPU for other
/// processes over workers. This keeps CPU available to the GCS, raylet, and user
/// processes even when workers are busy.
/// Valid values are in [0, 19] (negative values require sudo permissions).
/// NOTE: Linux, Unix, and macOS only.
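/// For example, the default of 15 makes workers run at a much lower CPU priority than
/// the default niceness of 0.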
RAY_CONFIG(int, worker_niceness, 15)
/// Allow at least 60 seconds for connecting to Redis.
RAY_CONFIG(int64_t, redis_db_connect_retries, 120)
RAY_CONFIG(int64_t, redis_db_connect_wait_milliseconds, 500)
/// Number of retries for a redis request failure.
RAY_CONFIG(size_t, num_redis_request_retries, 5)
/// Exponential backoff setup. By default:
/// 100ms, 200ms, 400ms, 800ms, 1s, 1s,...
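/// That is, the nth retry waits roughly
/// min(redis_retry_base_ms * redis_retry_multiplier^n, redis_retry_max_ms) ms.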
RAY_CONFIG(int64_t, redis_retry_base_ms, 100)
RAY_CONFIG(int64_t, redis_retry_multiplier, 2)
RAY_CONFIG(int64_t, redis_retry_max_ms, 1000)
/// The object manager's global timer interval in milliseconds.
RAY_CONFIG(int, object_manager_timer_freq_ms, 100)
/// Timeout, in milliseconds, to wait before retrying a failed pull in the
/// ObjectManager.
RAY_CONFIG(int, object_manager_pull_timeout_ms, 10000)
/// Timeout, in milliseconds, to wait until a Push request fails.
/// Special values:
/// Negative: wait infinitely.
/// 0: give up retrying immediately.
RAY_CONFIG(int, object_manager_push_timeout_ms, 10000)
/// Default chunk size for multi-chunk transfers to use in the object manager.
/// In the object manager, no single thread is permitted to transfer more
/// data than what is specified by the chunk size unless the number of object
/// chunks exceeds the number of available sending threads.
/// NOTE(ekl): this has been raised to lower broadcast overheads.
RAY_CONFIG(uint64_t, object_manager_default_chunk_size, 5 * 1024 * 1024)
/// The maximum number of outbound bytes to allow to be outstanding. This avoids
/// excessive memory usage during object broadcast to many receivers.
RAY_CONFIG(uint64_t,
object_manager_max_bytes_in_flight,
((uint64_t)2) * 1024 * 1024 * 1024)
/// Maximum number of IDs in one batch to send to GCS to delete keys.
RAY_CONFIG(uint32_t, maximum_gcs_deletion_batch_size, 1000)
/// Maximum number of items in one batch to scan/get/delete from GCS storage.
RAY_CONFIG(uint32_t, maximum_gcs_storage_operation_batch_size, 1000)
/// When getting objects from the object store, the max number of IDs to print in the
/// warning message.
RAY_CONFIG(uint32_t, object_store_get_max_ids_to_print_in_warning, 20)
/// Number of polling threads used by rpc server in gcs server. These threads poll for
/// requests and copy from the socket buffer to create the proto request object.
RAY_CONFIG(uint32_t,
gcs_server_rpc_server_thread_num,
std::max(1U, std::thread::hardware_concurrency() / 4U))
/// Number of polling threads for raylet + worker clients on the GCS. These threads poll
/// for replies and copy from the socket buffer to create the proto Reply object.
RAY_CONFIG(uint32_t,
gcs_server_rpc_client_thread_num,
std::max(1U, std::thread::hardware_concurrency() / 4U))
/// The interval at which the gcs server will health check the connection to the
/// external Redis server. If a health check fails, the GCS will crash itself.
/// Set to zero to disable health checking.
RAY_CONFIG(uint64_t, gcs_redis_heartbeat_interval_milliseconds, 100)
/// Duration to wait between retries for leasing a worker in the GCS server.
RAY_CONFIG(uint32_t, gcs_lease_worker_retry_interval_ms, 200)
/// Duration to wait between retries for creating an actor in the GCS server.
RAY_CONFIG(uint32_t, gcs_create_actor_retry_interval_ms, 200)
/// Exponential backoff params for the GCS to retry creating a placement group.
RAY_CONFIG(uint64_t, gcs_create_placement_group_retry_min_interval_ms, 100)
RAY_CONFIG(uint64_t, gcs_create_placement_group_retry_max_interval_ms, 1000)
RAY_CONFIG(double, gcs_create_placement_group_retry_multiplier, 1.5)
/// Maximum number of destroyed actors in GCS server memory cache.
RAY_CONFIG(uint32_t, maximum_gcs_destroyed_actor_cached_count, 100000)
/// Maximum number of dead nodes in GCS server memory cache.
RAY_CONFIG(uint32_t, maximum_gcs_dead_node_cached_count, 1000)
// The storage backend to use for the GCS. It can be either 'redis' or 'memory'.
RAY_CONFIG(std::string, gcs_storage, "memory")
/// Duration to sleep after failing to put an object in plasma because it is full.
RAY_CONFIG(uint32_t, object_store_full_delay_ms, 10)
/// The threshold to trigger a global GC.
RAY_CONFIG(double, plasma_store_usage_trigger_gc_threshold, 0.7)
/// The amount of time between automatic local Python GC triggers.
RAY_CONFIG(uint64_t, local_gc_interval_s, 90 * 60)
/// The min amount of time between local GCs (whether auto or mem pressure triggered).
RAY_CONFIG(uint64_t, local_gc_min_interval_s, 10)
/// The min amount of time between triggering global_gc in raylet. This only applies
/// to global GCs triggered due to plasma_store_usage_trigger_gc_threshold.
RAY_CONFIG(uint64_t, global_gc_min_interval_s, 30)
/// Duration to wait between retries for failed tasks.
RAY_CONFIG(uint32_t, task_retry_delay_ms, 0)
/// The base retry delay for exponential backoff when the task fails due to OOM.
/// No delay if this value is zero.
RAY_CONFIG(uint32_t, task_oom_retry_delay_base_ms, 1000)
/// The base retry delay for exponential backoff when an actor task fails with
/// ACTOR_UNAVAILABLE (e.g., actor is restarting or network error).
RAY_CONFIG(uint32_t, task_actor_unavailable_retry_delay_base_ms, 100)
/// The maximum retry delay for ACTOR_UNAVAILABLE exponential backoff.
RAY_CONFIG(uint32_t, task_actor_unavailable_retry_max_delay_ms, 5000)
/// Duration to wait between retrying to kill a task.
RAY_CONFIG(uint32_t, cancellation_retry_ms, 2000)
/// Determines if forking in Ray actors / tasks is supported.
/// Note that this only enables forking in workers, not drivers.
RAY_CONFIG(bool, support_fork, false)
/// Maximum timeout for GCS reconnection in seconds.
/// Each reconnection ping will be retried every 1 second.
RAY_CONFIG(uint32_t, gcs_rpc_server_reconnect_timeout_s, 60)
/// The timeout for GCS connection in seconds
RAY_CONFIG(int32_t, gcs_rpc_server_connect_timeout_s, 5)
/// gRPC channel reconnection related configs to GCS.
/// Check https://grpc.github.io/grpc/core/group__grpc__arg__keys.html for details
/// Note: `gcs_grpc_min_reconnect_backoff_ms` is (mis)used by gRPC as the connection
/// timeout. If your cluster has high latency, set it to > 4x the latency.
RAY_CONFIG(int32_t, gcs_grpc_max_reconnect_backoff_ms, 2000)
RAY_CONFIG(int32_t, gcs_grpc_min_reconnect_backoff_ms, 1000)
RAY_CONFIG(int32_t, gcs_grpc_initial_reconnect_backoff_ms, 100)
/// Maximum bytes of requests queued when RPCs fail because the GCS is down.
/// If the limit is reached, the core worker will hang until the GCS is reconnected.
/// By default, the value is 5GB.
RAY_CONFIG(uint64_t, gcs_grpc_max_request_queued_max_bytes, 1024UL * 1024 * 1024 * 5)
/// The duration between two checks for grpc status.
RAY_CONFIG(int32_t, grpc_client_check_connection_status_interval_milliseconds, 1000)
/// Due to a protocol drawback, the raylet needs to refresh the message if
/// no message is received for a while.
/// Refer to https://tinyurl.com/n6kvsp87 for more details.
RAY_CONFIG(int64_t, ray_syncer_message_refresh_interval_ms, 3000)
/// The batch size for metrics export.
/// Normally each time series is << 1KB, so a batch size of 10,000 means the expected
/// payload will be under 10MB.
RAY_CONFIG(int64_t, metrics_report_batch_size, 10000)
/// Whether task events (status change and profiling events) from the driver should be
/// ignored. Currently for testing only.
RAY_CONFIG(bool, task_events_skip_driver_for_test, false)
/// The interval duration for which task state events will be reported to GCS.
/// The reported data should only be used for observability.
/// Setting the value to 0 disables the task event recording and reporting.
RAY_CONFIG(int64_t, task_events_report_interval_ms, 1000)
/// The interval duration for which ray events will be reported to the event aggregator.
/// The reported data should only be used for observability.
/// Setting the value to 0 disables the ray event recording and reporting.
RAY_CONFIG(int64_t, ray_events_report_interval_ms, 1000)
/// The number of tasks tracked in GCS for task state events. Any additional events
/// from new tasks will evict events of tasks reported earlier.
/// Setting the value to -1 allows for unlimited task events stored in GCS.
RAY_CONFIG(int64_t, task_events_max_num_task_in_gcs, 100000)
/// The maximum number of dropped task attempts tracked per job at the GCS. When the GCS
/// is forced to stop tracking some task attempts that are lost, this will incur
/// potential partial data loss for a single task attempt (e.g. some task events were
/// dropped, but some were tracked). When this happens, users should be cautious of
/// inconsistency in the task events data.
RAY_CONFIG(int64_t,
task_events_max_dropped_task_attempts_tracked_per_job_in_gcs,
1 * 1000 * 1000)
/// Max number of task status events stored on
/// workers. Events will be evicted based on a FIFO order.
RAY_CONFIG(uint64_t, task_events_max_num_status_events_buffer_on_worker, 100 * 1000)
/// Max number of task status events that will be stored to export
/// for the export API. Events will be evicted based on a FIFO order.
RAY_CONFIG(uint64_t,
task_events_max_num_export_status_events_buffer_on_worker,
1000 * 1000)
/// Max number of task events to be sent in a single message to GCS. This caps both
/// the message size and the processing work on GCS.
RAY_CONFIG(uint64_t, task_events_send_batch_size, 10 * 1000)
/// Max number of task events to be written in a single flush iteration. This
/// caps the number of file writes per iteration.
RAY_CONFIG(uint64_t, export_task_events_write_batch_size, 10 * 1000)
/// Max number of profile events allowed to be tracked for a single task.
/// Setting the value to -1 allows unlimited profile events to be tracked.
RAY_CONFIG(int64_t, task_events_max_num_profile_events_per_task, 1000)
/// The max number of profile events allowed to be stored in the buffer on the worker
/// side. Events will be evicted based on a FIFO order.
RAY_CONFIG(uint64_t, task_events_max_num_profile_events_buffer_on_worker, 10 * 1000)
/// Max number of task attempts being dropped on the worker side to report to GCS.
/// Setting the value to -1 allows unlimited dropped task attempts in a single
/// report to GCS.
RAY_CONFIG(int64_t, task_events_dropped_task_attempt_batch_size, 10 * 1000)
/// Timeout in milliseconds to wait for task events to be flushed during shutdown.
/// During graceful shutdown, the TaskEventBuffer and RayEventRecorder will wait up to
/// this duration for in-flight gRPC calls to complete before stopping the io_service.
RAY_CONFIG(int64_t, task_events_shutdown_flush_timeout_ms, 5000)
/// The delay in ms after which GCS marks any still-running tasks from a finished job
/// as failed. Setting this value too small might result in some finished tasks being
/// marked as failed by GCS.
RAY_CONFIG(uint64_t, gcs_mark_task_failed_on_job_done_delay_ms, /* 15 secs */ 1000 * 15)
/// The delay in ms after which GCS marks any running tasks from a dead worker as
/// failed. Setting this value too small might result in some finished tasks being
/// marked as failed by GCS, since task events data are pushed to GCS asynchronously.
RAY_CONFIG(uint64_t, gcs_mark_task_failed_on_worker_dead_delay_ms, /* 1 secs */ 1000 * 1)
/// Whether or not we enable metrics collection.
RAY_CONFIG(bool, enable_metrics_collection, true)
/// Determines whether high-cardinality labels such as WorkerId and task and actor
/// names should be used in the metrics. For the complete definition, see
/// RAY_METRIC_CARDINALITY_LEVEL in ray_constants.py.
RAY_CONFIG(std::string, metric_cardinality_level, "legacy")
/// Whether to enable OpenTelemetry as the metrics collection backend; otherwise
/// OpenCensus is used.
RAY_CONFIG(bool, enable_open_telemetry, true)
/// Whether to disable the OpenTelemetry SDK logs. They are disabled by default
/// to prevent noisy gRPC errors during shutdown.
/// See https://github.com/ray-project/ray/issues/58256 for details.
RAY_CONFIG(bool, disable_open_telemetry_sdk_log, true)
/// Whether to enable Ray Event as the event collection backend. By default, the
/// Export API is used.
RAY_CONFIG(bool, enable_ray_event, false)
RAY_CONFIG(uint64_t, ray_event_recorder_max_queued_events, 10000)
/// Comma-separated list of components for which we enable gRPC metrics collection.
/// Only effective if `enable_metrics_collection` is also true. Incurs some performance
/// degradation.
///
/// Valid fields: "gcs".
/// TODO: it only works for the GCS now. The goal is to support "gcs,core_worker,raylet".
/// The problem is we need this config field *before* any gRPC call, but the raylet and
/// core_worker receive configs from the GCS and raylet respectively, so the configs are
/// only available *after* a gRPC call.
RAY_CONFIG(std::string, enable_grpc_metrics_collection_for, "")
/// Only effective if `enable_metrics_collection` is also true.
///
/// If > 0, we monitor each instrumented_io_context every
/// `io_context_event_loop_lag_collection_interval_ms` milliseconds, by posting a task to
/// the io_context to measure the duration from post to run. The metric is
/// `ray_io_context_event_loop_lag_ms`.
///
/// A probe task is only posted after a previous probe task has completed.
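/// In other words, each probe reports lag = (time the probe task actually ran) -
/// (time it was posted).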
RAY_CONFIG(int64_t, io_context_event_loop_lag_collection_interval_ms, 10000)
// Max number of bytes of inlined objects in a task RPC request/response.
RAY_CONFIG(int64_t, task_rpc_inlined_bytes_limit, 10 * 1024 * 1024)
/// Maximum number of pending lease requests per scheduling category.
/// -1 means that Ray should automatically set this to the number of nodes in
/// the cluster.
RAY_CONFIG(int64_t, max_pending_lease_requests_per_scheduling_category, -1)
/// Wait timeout for dashboard agent registration.
#ifdef _WIN32
// agent startup time can involve creating conda environments
RAY_CONFIG(uint32_t, agent_register_timeout_ms, 100 * 1000)
#else
RAY_CONFIG(uint32_t, agent_register_timeout_ms, 30 * 1000)
#endif
/// If true, the agent checks the health of its parent process by reading a pipe.
/// If false, it checks the parent PID using psutil.
RAY_CONFIG(bool, enable_pipe_based_agent_to_parent_health_check, true)
/// If the agent manager fails to communicate with the dashboard agent or the runtime env
/// agent, we will retry after this interval.
RAY_CONFIG(uint32_t, agent_manager_retry_interval_ms, 1000)
/// The maximum number of resource shapes included in the resource
/// load reported by each raylet.
RAY_CONFIG(int64_t, max_resource_shapes_per_load_report, 100)
/// The timeout for synchronous GCS requests in seconds.
RAY_CONFIG(int64_t, gcs_server_request_timeout_seconds, 60)
/// Whether to enable worker prestarting: https://github.com/ray-project/ray/issues/12052
RAY_CONFIG(bool, enable_worker_prestart, false)
/// Whether to enable worker prestarting on the first driver.
/// TODO(clarng): reconcile with enable_worker_prestart
RAY_CONFIG(bool, prestart_worker_first_driver, true)
/// The interval of periodic idle worker killing. Value of 0 means worker capping is
/// disabled.
RAY_CONFIG(uint64_t, kill_idle_workers_interval_ms, 200)
/// The idle time threshold for an idle worker to be killed.
RAY_CONFIG(int64_t, idle_worker_killing_time_threshold_ms, 1000)
/// The soft limit of the number of workers to keep around.
/// We apply this limit to the idle workers instead of total workers,
/// because the total number of workers used depends on the
/// application. -1 means using the available number of CPUs.
RAY_CONFIG(int64_t, num_workers_soft_limit, -1)
// The interval at which metrics are exported, in milliseconds.
RAY_CONFIG(uint64_t, metrics_report_interval_ms, 10000)
/// Enable the task timeline. If this is enabled, certain events such as task
/// execution are profiled and sent to the GCS.
/// This requires RAY_task_events_report_interval_ms > 0, so that events will
/// be sent to GCS.
RAY_CONFIG(bool, enable_timeline, true)
/// The maximum number of pending placement group entries that are reported to the
/// monitor to autoscale the cluster.
RAY_CONFIG(int64_t, max_placement_group_load_report_size, 1000)
/* Configuration parameters for object spilling. */
/// JSON configuration that describes the external storage. This is passed to
/// Python IO workers to determine how to store/restore an object to/from
/// external storage.
RAY_CONFIG(std::string, object_spilling_config, "")
/// The path to spill objects to. The same path will be used as the object store
/// fallback directory as well. When both object_spilling_config and
/// object_spilling_directory are set, object_spilling_directory will take
/// precedence. When object_spilling_directory is also set via ray.init() or ray start,
/// the directory set with ray.init() or ray start will take precedence.
RAY_CONFIG(std::string, object_spilling_directory, "")
/// Log an ERROR-level message about spilling every time this amount of bytes has been
/// spilled, with exponential increase in interval. This can be set to zero to disable.
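/// For example, with the default 2 GiB value, messages would be logged after roughly
/// 2 GiB, 4 GiB, 8 GiB, ... of cumulative spilled bytes.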
RAY_CONFIG(int64_t, verbose_spill_logs, 2L * 1024 * 1024 * 1024)
/// Whether to enable automatic object spilling. If enabled, then
/// Ray will choose objects to spill when the object store is out of
/// memory.
RAY_CONFIG(bool, automatic_object_spilling_enabled, true)
/// The maximum number of I/O workers that the raylet starts.
RAY_CONFIG(int, max_io_workers, 4)
/// Ray's object spilling fuses small objects into a single file before flushing them,
/// to optimize performance.
/// Ray will try to spill at least this many bytes at once, fusing objects up to
/// max_fused_object_count. 100 MB by default. Setting this value beyond
/// --object-store-memory is not recommended.
RAY_CONFIG(int64_t, min_spilling_size, 100 * 1024 * 1024)
/// Maximum size (bytes) of a single spilled file (i.e. one spill worker request).
/// When > 0, the raylet caps the total bytes fused into a single spill request.
/// This helps avoid generating very large spill files that may be hard to delete
/// promptly when multiple object references keep them alive (to avoid running out of
/// disk space).
/// Trade-off: smaller caps reduce spill fusion and can lower effective spill throughput
/// due to higher per-file overhead. If spilling cannot keep up with allocation under
/// memory pressure, this may increase the likelihood of object store OOMs.
/// Set to -1 to disable this limit.
RAY_CONFIG(int64_t, max_spilling_file_size_bytes, -1)
/// If set to less than 1.0, Ray will start spilling objects when existing primary objects
/// take more than this percentage of the available memory.
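/// For example, the default of 0.8 starts spilling once primary objects occupy more
/// than 80% of the available object store memory.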
RAY_CONFIG(float, object_spilling_threshold, 0.8)
/// Maximum number of objects that can be fused into a single file.
RAY_CONFIG(int64_t, max_fused_object_count, 2000)
/// Grace period until we throw the OOM error to the application in seconds.
/// In unlimited allocation mode, this is the time delay prior to fallback allocating.
RAY_CONFIG(int64_t, oom_grace_period_s, 2)
/// Whether or not the external storage is the local file system.
/// Note that this value should be overridden based on the storage type
/// specified by object_spilling_config.
RAY_CONFIG(bool, is_external_storage_type_fs, true)
/// Controls the capacity threshold for the Ray local file system (for the object
/// store). Once we are over capacity, all subsequent object creation will fail.
RAY_CONFIG(float, local_fs_capacity_threshold, 0.95)
/// Controls the frequency of checking disk usage.
RAY_CONFIG(uint64_t, local_fs_monitor_interval_ms, 100)
/* Configuration parameters for locality-aware scheduling. */
/// Whether to enable locality-aware leasing. If enabled, then Ray will consider task
/// dependency locality when choosing a worker for leasing.
RAY_CONFIG(bool, locality_aware_leasing_enabled, true)
/* Configuration parameters for logging */
/// Parameters for log rotation. This value is equivalent to RotatingFileHandler's
/// maxBytes argument.
RAY_CONFIG(int64_t, log_rotation_max_bytes, 100 * 1024 * 1024)
/// Parameters for log rotation. This value is equivalent to RotatingFileHandler's
/// backupCount argument.
RAY_CONFIG(int64_t, log_rotation_backup_count, 5)
/// For tasks that can't be sent because of a network error, we'll never receive a DEAD
/// notification; in this case we'll wait for a fixed timeout value and then mark the
/// task as failed.
RAY_CONFIG(int64_t, timeout_ms_task_wait_for_death_info, 1000)
/// The core worker heartbeat interval. During each heartbeat, the worker
/// reports its load to the raylet.
RAY_CONFIG(int64_t, core_worker_internal_heartbeat_ms, 1000)
/// Interval at which workers report their backlog of tasks with unresolved dependencies
/// to the local raylet, used for autoscaling decisions.
RAY_CONFIG(int64_t, report_worker_backlog_interval_ms, 1000)
/// Starting timeout for core worker grpc server reconnection (will
/// exponentially increase until the maximum timeout).
RAY_CONFIG(uint32_t, core_worker_rpc_server_reconnect_timeout_base_s, 1)
/// Maximum timeout for core worker grpc server reconnection.
RAY_CONFIG(uint32_t, core_worker_rpc_server_reconnect_timeout_max_s, 60)
/// Maximum amount of memory that will be used by running tasks' args.
RAY_CONFIG(float, max_task_args_memory_fraction, 0.7)
/// The maximum number of objects to publish for each publish call.
RAY_CONFIG(int, publish_batch_size, 5000)
/// Maximum size in bytes of buffered messages per pubsub channel. Large
/// applications (1k+ nodes, 100k+ tasks or actors) may see memory pressure in
/// the GCS due to high system-level pubsub traffic. Reducing this config value
/// can help reduce memory pressure, at the cost of dropping some published
/// messages (e.g., worker logs printed to driver stdout). See
/// src/ray/pubsub/publisher.cc for the current pubsub channels that are
/// subject to this cap.
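/// (The default below, 1 << 30 bytes, is 1 GiB.)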
RAY_CONFIG(int, publisher_entity_buffer_max_bytes, 1 << 30)
/// The maximum command batch size.
RAY_CONFIG(int64_t, max_command_batch_size, 2000)
/// The maximum batch size for OBOD report.
RAY_CONFIG(int64_t, max_object_report_batch_size, 2000)
/// For Ray publishers, the minimum time, in ms, before an inactive subscriber
/// connection is dropped. In the current implementation, a subscriber might be dead
/// for up to 3x the configured time before it is deleted from the publisher, i.e.
/// deleted in 300s ~ 900s.
RAY_CONFIG(uint64_t, subscriber_timeout_ms, 300 * 1000)
// This is the minimum time an actor will remain in the actor table before
// being garbage collected when a job finishes.
RAY_CONFIG(uint64_t, gcs_actor_table_min_duration_ms, /* 5 min */ 60 * 1000 * 5)
RAY_CONFIG(uint32_t, max_error_msg_size_bytes, 512 * 1024)
// The number of seconds to wait for the Raylet to start. This is normally
// fast, but when RAY_preallocate_plasma_memory=1 is set, it may take some time
// to populate all the pages (at a few GB/s) on Raylet startup.
RAY_CONFIG(uint32_t,
raylet_start_wait_time_s,
std::getenv("RAY_preallocate_plasma_memory") != nullptr &&
std::getenv("RAY_preallocate_plasma_memory") == std::string("1")
? 120
: 30)
/// The scheduler will treat these predefined resource types as unit_instance.
/// Default predefined_unit_instance_resources is "GPU".
/// When set to "CPU,GPU", CPU will also be treated as unit_instance.
RAY_CONFIG(std::string, predefined_unit_instance_resources, "GPU")
/// The scheduler will treat these custom resource types as unit_instance.
/// This allows the scheduler to provide chip IDs for custom resources like
/// "neuron_cores", "TPUs", and "FPGAs".
/// Default custom_unit_instance_resources is "neuron_cores,TPU,NPU,HPU,RBLN".
/// When "FPGA" is added to the list, FPGA will also be treated as unit_instance.
RAY_CONFIG(std::string, custom_unit_instance_resources, "neuron_cores,TPU,NPU,HPU,RBLN")
/// The name of the system-created concurrency group for actors. This group is
/// created with 1 thread, and is created lazily. The intended usage is for
/// Ray-internal auxiliary tasks (e.g., compiled graph workers).
RAY_CONFIG(std::string, system_concurrency_group_name, "_ray_system")
/// Number of ServerCall instances for each RPC service handler.
///
/// NOTE: Default value is temporarily pegged at `gcs_server_rpc_server_thread_num * 100`
/// to keep it at the level it has been prior to
/// https://github.com/ray-project/ray/pull/47664
RAY_CONFIG(int64_t,
gcs_max_active_rpcs_per_handler,
gcs_server_rpc_server_thread_num() * 100)
/// gRPC keepalive send interval for the server.
/// This is currently only configured in the GCS server.
RAY_CONFIG(int64_t, grpc_keepalive_time_ms, 10000)
/// gRPC keepalive timeout for the server.
RAY_CONFIG(int64_t, grpc_keepalive_timeout_ms, 20000)
/// NOTE: we set a loose client keepalive because some components have a failure model
/// that considers network failures as component failures, and an aggressive keepalive
/// configuration breaks that assumption. We should apply this to every other component
/// after we change this failure assumption in the code.
/// gRPC keepalive send interval for the client.
RAY_CONFIG(int64_t, grpc_client_keepalive_time_ms, 300000)
/// gRPC keepalive timeout for the client.
RAY_CONFIG(int64_t, grpc_client_keepalive_timeout_ms, 120000)
RAY_CONFIG(int64_t, grpc_client_idle_timeout_ms, 1800000)
/// gRPC streaming buffer size.
/// Set to 512KB by default.
RAY_CONFIG(int64_t, grpc_stream_buffer_size, 512 * 1024);
/// Whether to use the log reporter in the event framework.
RAY_CONFIG(bool, event_log_reporter_enabled, true)
/// Whether or not we should also write an event log to a log file.
/// This has no effect if `event_log_reporter_enabled` is false.
RAY_CONFIG(bool, emit_event_to_log_file, false)
/// Event severity threshold value
RAY_CONFIG(std::string, event_level, "warning")
/// Whether to avoid scheduling CPU requests on GPU nodes.
RAY_CONFIG(bool, scheduler_avoid_gpu_nodes, true)
/// Whether to skip running local GC in runtime env.
RAY_CONFIG(bool, runtime_env_skip_local_gc, false)
/// The namespace for the storage.
/// This field is used to isolate data stored in the DB.
RAY_CONFIG(std::string, external_storage_namespace, "default")
/// Whether or not to use TLS.
RAY_CONFIG(bool, USE_TLS, false)
/// Location of TLS credentials
RAY_CONFIG(std::string, TLS_SERVER_CERT, "")
RAY_CONFIG(std::string, TLS_SERVER_KEY, "")
RAY_CONFIG(std::string, TLS_CA_CERT, "")
/// Location of Redis TLS credentials
/// https://github.com/redis/hiredis/blob/c78d0926bf169670d15cfc1214e4f5d21673396b/README.md#hiredis-openssl-wrappers
RAY_CONFIG(std::string, REDIS_CA_CERT, "")
RAY_CONFIG(std::string, REDIS_CA_PATH, "")
RAY_CONFIG(std::string, REDIS_CLIENT_CERT, "")
RAY_CONFIG(std::string, REDIS_CLIENT_KEY, "")
RAY_CONFIG(std::string, REDIS_SERVER_NAME, "")
/// gRPC delay testing flags.
/// To use this,
/// export RAY_testing_asio_delay_us="method1=min_val:max_val,method2=20:100"
/// The delay is a random number within the given interval. If the method equals '*',
/// it will apply to all methods.
RAY_CONFIG(std::string, testing_asio_delay_us, "")
/// To use this,
/// export
/// RAY_testing_rpc_failure='{"method1":{"num_failures":X,"req_failure_prob":Y,"resp_failure_prob":Z,"in_flight_failure_prob":W}}'
///
/// If you want to test all RPC failures you can use * as the method name and you can set
/// -1 num_failures to have unlimited failures.
/// Ex. unlimited failures for all RPCs with 25% request failures, 50% response
/// failures, and 10% in-flight failures.
/// export
/// RAY_testing_rpc_failure='{"*":{"num_failures":-1,"req_failure_prob":25,"resp_failure_prob":50,"in_flight_failure_prob":10}}'
/// This will set the probabilities for all RPCs to 25% for request failures, 50% for
/// response failures, and 10% for in-flight failures.
/// NOTE: Setting the wildcard will override any configuration for other methods.
///
/// You can also provide an optional fifth, sixth, and/or seventh parameter to specify
/// that there should be at least a certain number of failures.
/// The 5th parameter is for request failures.
/// The 6th parameter is for response failures.
/// The 7th parameter is for in-flight failures.
/// By default these are set to 0, but by setting them to positive values it guarantees
/// that the first X request RPCs will fail, followed by Y response RPCs that will fail,
/// followed by Z in-flight RPCs that will fail.
/// Afterwards, it will revert to the probabilistic failures. You can combine this with
/// the wildcard so that each RPC method will have the same lower bounds applied.
///
/// Ex. unlimited failures for all RPCs with 25% request failures, 50% response failures,
/// and 10% in-flight failures with at least 2 request failures, 3 response failures, and
/// 1 in-flight failure:
/// export
/// RAY_testing_rpc_failure='{"*":{"num_failures":-1,"req_failure_prob":25,"resp_failure_prob":50,"in_flight_failure_prob":10,"num_lower_bound_req_failures":2,"num_lower_bound_resp_failures":3,"num_lower_bound_in_flight_failures":1}}'
RAY_CONFIG(std::string, testing_rpc_failure, "")
/// If this is set, when injecting RPC failures, we'll check if the server and client have
/// the same address. If they do, we won't inject the failure.
RAY_CONFIG(bool, testing_rpc_failure_avoid_intra_node_failures, false)
/// The following are configs for the health check. They are borrowed
/// from the k8s health probe (shorturl.at/jmTY3).
/// The delay to send the first health check.
RAY_CONFIG(int64_t, health_check_initial_delay_ms, 5000)
/// The interval between two health checks.
RAY_CONFIG(int64_t, health_check_period_ms, 3000)
/// The timeout for a health check.
RAY_CONFIG(int64_t, health_check_timeout_ms, 10000)
/// The threshold to consider a node dead.
RAY_CONFIG(int64_t, health_check_failure_threshold, 5)
/// Thread pool size for sending replies in grpc server (system components: raylet, GCS).
RAY_CONFIG(int64_t,
num_server_call_thread,
std::max((int64_t)1, (int64_t)(std::thread::hardware_concurrency() / 4U)))
/// Thread pool size for sending replies in grpc server (CoreWorkers).
/// https://github.com/ray-project/ray/issues/58351 shows the
/// reply path is light enough that 2 threads is sufficient.
RAY_CONFIG(int64_t,
core_worker_num_server_call_thread,
std::thread::hardware_concurrency() >= 8 ? 2 : 1);
/// Use madvise to prevent worker/raylet coredumps from including
/// the mapped plasma pages.
RAY_CONFIG(bool, worker_core_dump_exclude_plasma_store, true)
RAY_CONFIG(bool, raylet_core_dump_exclude_plasma_store, true)
// Instruct the Python default worker to preload the specified imports.
// This is specified as a comma-separated list.
// If left empty, no such attempt will be made.
// Example: RAY_preload_python_modules=tensorflow,pytorch
RAY_CONFIG(std::vector<std::string>, preload_python_modules, {})
// By default, the raylet sends a liveness self-check to the GCS every 60s.
RAY_CONFIG(int64_t, raylet_liveness_self_check_interval_ms, 60000)
// Instruct the CoreWorker to kill its child processes when
// it exits. This prevents certain classes of resource leaks
// caused by worker processes leaking child processes.
// If a user relies on Ray's old behavior of leaking processes,
// they can disable this behavior with
// RAY_kill_child_processes_on_worker_exit=false. We anticipate
// keeping this flag around at least until Ray 2.5.
// See https://github.com/ray-project/ray/pull/33976 for more
// info.
RAY_CONFIG(bool, kill_child_processes_on_worker_exit, true)
// Make the Raylet and CoreWorker become Linux subreapers, and let the Raylet kill
// the child processes of a worker when the worker exits. This is useful for
// the case where the worker crashed and had no chance to clean up its child processes.
// Only works on Linux >= 3.4. On other platforms, this flag is ignored.
// See https://github.com/ray-project/ray/pull/42992 for more info.
RAY_CONFIG(bool, kill_child_processes_on_worker_exit_with_raylet_subreaper, false)