Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions .dockerignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.git/
.venv/
**/__pycache__/
**/*.pyc
4 changes: 2 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ cython_debug/
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
.idea/

testing/

Expand All @@ -171,4 +171,4 @@ scripts/exports/

# ignore pyarmor updates
neurons/miners/model/obfuscated/*
neurons/miners/obfuscated_pipeline/**
neurons/miners/obfuscated_pipeline/**
12 changes: 9 additions & 3 deletions deval/api/miner_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,9 +12,15 @@
model_dir = "/app/eval_llm"
sys.path.append(model_dir) # matches to the location of the mounted directory
model_url = os.getenv("MODEL_URL", "")
model_volume_dir = os.getenv("MODEL_VOLUME_DIR", "")

if model_url != "":
model_dir = HuggingFaceModel.pull_model_and_files(model_url)
if model_url != "" or model_volume_dir:
if model_url:
model_dir = HuggingFaceModel.pull_model_and_files(model_url)
else:
print("Loading model from volume dir")
model_dir = model_volume_dir
sys.path.append(model_dir)

from model.pipeline import DeValPipeline
pipe = DeValPipeline("de_val", model_dir = model_dir)
Expand Down Expand Up @@ -81,4 +87,4 @@ async def get_model_coldkey() -> ModelColdkeyResponse:

@app.get("/health")
async def health()->bool:
return True
return True
7 changes: 2 additions & 5 deletions deval/api/miner_docker_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,9 @@

class MinerDockerClient:

def __init__(self):
def __init__(self, api_url: str = "http://0.0.0.0:8000"):
self.service_name = "miner-api"
self.host = f"http://0.0.0.0"
self.port = 8000
self.api_url = f"{self.host}:{self.port}"
self.api_url = api_url

def _poll_service_for_readiness(self, max_wait_time: int) -> bool:
#TODO: check for errors to stop polling when we know we failed
Expand Down Expand Up @@ -212,4 +210,3 @@ def get_container_size(self):
docker_client.cleanup()



11 changes: 6 additions & 5 deletions deval/base/validator.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,8 @@ def run(self):

# If someone intentionally stops the validator, it'll safely terminate operations.
except KeyboardInterrupt:
self.axon.stop()
if hasattr(self, "axon"):
self.axon.stop()
bt.logging.success("Validator killed by keyboard interrupt.")
sys.exit()

Expand Down Expand Up @@ -414,12 +415,12 @@ def load_state(self):
self.weights = past_weights.get("past_weights", [])
else:
self.weights = []
# load scores

# load scores
tmp_scores = past_weights.get("scores")
if tmp_scores is not None:
self.scores = tmp_scores

except Exception as e:
bt.logging.warning(f"Unable to load weights data with error: {e}")
self.weights = []
Expand Down Expand Up @@ -466,5 +467,5 @@ def update_scores(self, model_rewards: dict[int, dict[str, list[float]]], denom:
self.scores = (self.scores - constants.alpha_decay).clamp(min=0)
bt.logging.info(f"Updated moving avg scores: {self.scores}")

# return expected format for contest
# return expected format for contest
return [(i, score) for i, score in enumerate(self.scores.tolist())]
124 changes: 124 additions & 0 deletions deval/compute_horde_client.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
import pickle
import time

import bittensor as bt
from compute_horde_sdk.v1 import (
ComputeHordeClient as SDKComputeHordeClient,
InlineInputVolume,
HuggingfaceInputVolume,
ExecutorClass,
ComputeHordeJobStatus,
)

from deval.contest import DeValContest
from deval.model.model_state import ModelState
from deval.compute_horde_settings import (
COMPUTE_HORDE_ARTIFACT_OUTPUT_PATH,
COMPUTE_HORDE_FACILITATOR_URL,
COMPUTE_HORDE_JOB_NAMESPACE,
COMPUTE_HORDE_VALIDATOR_HOTKEY,
COMPUTE_HORDE_EXECUTOR_CLASS,
COMPUTE_HORDE_JOB_DOCKER_IMAGE,
COMPUTE_HORDE_JOB_TIMEOUT,
COMPUTE_HORDE_ARTIFACTS_DIR,
COMPUTE_HORDE_VOLUME_MODEL_PATH,
COMPUTE_HORDE_VOLUME_TASK_REPO_DIR,
COMPUTE_HORDE_VOLUME_MINER_STATE_DIR,
COMPUTE_HORDE_VOLUME_MINER_STATE_FILENAME,
COMPUTE_HORDE_VOLUME_TASK_REPO_FILENAME,
)
from deval.task_repository import TaskRepository

_REQUIRED_SETTINGS = (
"COMPUTE_HORDE_JOB_NAMESPACE",
"COMPUTE_HORDE_JOB_DOCKER_IMAGE",
"COMPUTE_HORDE_VALIDATOR_HOTKEY",
)


class ComputeHordeClient:
def __init__(self, keypair: bt.Keypair):
missing_settings = [
setting for setting in _REQUIRED_SETTINGS if not globals().get(setting)
]
if missing_settings:
raise ValueError(
f"Required settings: {', '.join(missing_settings)} are not set. "
"Please set them in your .env file."
)

self.keypair = keypair
self.executor_class = (
ExecutorClass(COMPUTE_HORDE_EXECUTOR_CLASS)
if COMPUTE_HORDE_EXECUTOR_CLASS
else ExecutorClass.always_on__llm__a6000
)
self.client = SDKComputeHordeClient(
hotkey=self.keypair,
compute_horde_validator_hotkey=COMPUTE_HORDE_VALIDATOR_HOTKEY,
**(
{"facilitator_url": COMPUTE_HORDE_FACILITATOR_URL}
if COMPUTE_HORDE_FACILITATOR_URL
else {}
),
)

async def run_epoch_on_compute_horde(
self,
contest: DeValContest,
miner_state: ModelState,
task_repo: TaskRepository,
) -> ModelState:
task_repo_pkl = pickle.dumps(task_repo)

bt.logging.info("Running organic job on Compute Horde.")

job = await self.client.create_job(
executor_class=self.executor_class,
job_namespace=COMPUTE_HORDE_JOB_NAMESPACE,
docker_image=COMPUTE_HORDE_JOB_DOCKER_IMAGE,
args=[
"poetry",
"run",
"python",
"neurons/compute_horde_entrypoint.py",
"--timeout",
str(contest.timeout),
],
artifacts_dir=COMPUTE_HORDE_ARTIFACTS_DIR,
# TODO: Pin huggingface volume revision.
input_volumes={
COMPUTE_HORDE_VOLUME_MINER_STATE_DIR: InlineInputVolume.from_file_contents(
COMPUTE_HORDE_VOLUME_MINER_STATE_FILENAME, pickle.dumps(miner_state)
),
COMPUTE_HORDE_VOLUME_TASK_REPO_DIR: InlineInputVolume.from_file_contents(
COMPUTE_HORDE_VOLUME_TASK_REPO_FILENAME, task_repo_pkl
),
COMPUTE_HORDE_VOLUME_MODEL_PATH: HuggingfaceInputVolume(
repo_id=miner_state.get_model_url()
),
},
)

start = time.time()

await job.wait(timeout=COMPUTE_HORDE_JOB_TIMEOUT)

time_took = time.time() - start

if job.result is None:
raise RuntimeError("Job result is None. Check logs for more details.")

bt.logging.info(job.result.stdout)

if job.status != ComputeHordeJobStatus.COMPLETED:
raise RuntimeError(
f"Job status is {job.status}. Check logs for more details."
)

bt.logging.success(f"Job finished in {time_took} seconds")

model_state_pkl = job.result.artifacts[COMPUTE_HORDE_ARTIFACT_OUTPUT_PATH]

miner_state: ModelState = pickle.loads(model_state_pkl)
return miner_state
28 changes: 28 additions & 0 deletions deval/compute_horde_settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
import os

from dotenv import load_dotenv, find_dotenv

load_dotenv(find_dotenv())

COMPUTE_HORDE_VALIDATOR_HOTKEY = os.environ.get("COMPUTE_HORDE_VALIDATOR_HOTKEY")
COMPUTE_HORDE_EXECUTOR_CLASS = os.environ.get("COMPUTE_HORDE_EXECUTOR_CLASS")
COMPUTE_HORDE_FACILITATOR_URL = os.environ.get("COMPUTE_HORDE_FACILITATOR_URL")
COMPUTE_HORDE_JOB_DOCKER_IMAGE = os.environ.get("COMPUTE_HORDE_JOB_DOCKER_IMAGE")
COMPUTE_HORDE_JOB_NAMESPACE = os.environ.get("COMPUTE_HORDE_JOB_NAMESPACE")
COMPUTE_HORDE_JOB_TIMEOUT = int(os.environ.get("COMPUTE_HORDE_JOB_TIMEOUT", 10 * 60))

COMPUTE_HORDE_VOLUME_MINER_STATE_DIR = "/volume/miner_state"
COMPUTE_HORDE_VOLUME_MINER_STATE_FILENAME = "miner_state.pkl"
COMPUTE_HORDE_VOLUME_MINER_STATE_PATH = os.path.join(
COMPUTE_HORDE_VOLUME_MINER_STATE_DIR, COMPUTE_HORDE_VOLUME_MINER_STATE_FILENAME
)
COMPUTE_HORDE_VOLUME_TASK_REPO_DIR = "/volume/task_repo"
COMPUTE_HORDE_VOLUME_TASK_REPO_FILENAME = "task_repo.pkl"
COMPUTE_HORDE_VOLUME_TASK_REPO_PATH = os.path.join(
COMPUTE_HORDE_VOLUME_TASK_REPO_DIR, COMPUTE_HORDE_VOLUME_TASK_REPO_FILENAME
)
COMPUTE_HORDE_VOLUME_MODEL_PATH = "/volume/model"
COMPUTE_HORDE_ARTIFACTS_DIR = "/artifacts"
COMPUTE_HORDE_ARTIFACT_OUTPUT_PATH = os.path.join(
COMPUTE_HORDE_ARTIFACTS_DIR, "output.pkl"
)
10 changes: 5 additions & 5 deletions deval/contest.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,15 @@ def validate_model(
print(f"Unable to get chain commit data including model hash: {miner_state.chain_model_hash} or block: {miner_state.block}")
return False

if not model_hash or not model_coldkey:
print("Unable to generate model hash or model coldkey, INVALID Model")
return False
# if not model_hash or not model_coldkey:
# print("Unable to generate model hash or model coldkey, INVALID Model")
# return False

if model_coldkey != miner_state.coldkey:
if model_coldkey is not None and model_coldkey != miner_state.coldkey:
print("Mismatch between the Miner's coldkey and the Model's Coldkey. INVALID Model")
return False

if miner_state.chain_model_hash != model_hash:
if model_hash is not None and miner_state.chain_model_hash != model_hash:

Copy link
Copy Markdown
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These changes seem unnecessary? it's reducing code lines but obfuscating the log (i.e., unable to tease out if missing or a mismatch)

@slawomir-gorawski-reef slawomir-gorawski-reef Mar 4, 2025

Copy link
Copy Markdown
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I did it this way so that it's possible to do some of the checks before we have the model hash or miner coldkey from docker (to avoid downloading the model here and in compute horde later too, like in my other comment). I changed the caller code to account for that: https://github.com/deval-core/De-Val/pull/68/files/6e8da07badffeac501b426c2f19285ff18af1f5d#diff-274d3bc59fd308b41d1dcd439b1385875eb9dbf11c1dfe915c3f596c3907cb15R185-R192

but if you'd like it to be done in a different way I'll see what I can do

print("Mismatch between the model hash on the chain commit and the model hash on huggingface")
return False

Expand Down
6 changes: 3 additions & 3 deletions deval/model/huggingface_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ def get_hf_token()-> str:
return hf_token

@staticmethod
def pull_model_and_files(model_url: str) -> str:
def pull_model_and_files(model_url: str, download_dir: str | None = None) -> str:
hf_token = HuggingFaceModel.get_hf_token()
download_dir = f"/app/eval_llm"
download_dir = download_dir or "/app/eval_llm"

print(f"Beggining the download of model data at {model_url}")
local_dir = snapshot_download(
Expand Down Expand Up @@ -59,4 +59,4 @@ def query_hf_model(pipe: pipeline, request: EvalRequest) -> EvalResponse:
score = score,
mistakes = mistakes,
response_time = process_time,
)
)
4 changes: 3 additions & 1 deletion deval/rewards/relevance.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
import time
import torch
from angle_emb import AnglE
Expand All @@ -16,8 +17,9 @@ def name(self) -> str:
def __init__(self, threshold=None, device=None, pooling_strategy="cls"):
super().__init__()
self.threshold = threshold
model_path = os.environ.get("ANGLE_MODEL_PATH", "WhereIsAI/UAE-Large-V1")
self.model = AnglE.from_pretrained(
"WhereIsAI/UAE-Large-V1", pooling_strategy=pooling_strategy, device=device
model_path, pooling_strategy=pooling_strategy, device=device
)
self.model.tokenizer._pad_token = self.model.tokenizer.pad_token
if device.startswith("cuda"):
Expand Down
11 changes: 7 additions & 4 deletions deval/task_repository.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from collections.abc import Iterator

from deval.tasks.task import TasksEnum, Task
from deval.llms.openai_llm import OpenAILLM
from deval.llms.bedrock_llm import AWSBedrockLLM
Expand Down Expand Up @@ -88,7 +90,7 @@

class TaskRepository:

def __init__(self, allowed_models: list[str] | None = None):
def __init__(self, allowed_models: list[str] | None = None, refresh_models_after_load: bool = True):
self.tasks: dict[TasksEnum, list[Task]] = {}

# initialize available models
Expand All @@ -97,6 +99,7 @@ def __init__(self, allowed_models: list[str] | None = None):
if allowed_models is not None:
self.supported_models = self.filter_to_allowed_models(allowed_models)

self.refresh_models_after_load = refresh_models_after_load
self.available_models = self.get_available_models()

def __getstate__(self):
Expand All @@ -106,7 +109,8 @@ def __getstate__(self):

def __setstate__(self, state):
self.__dict__.update(state)
self.available_models = self.get_available_models()
if self.refresh_models_after_load:
self.available_models = self.get_available_models()

def filter_to_allowed_models(self, allowed_models: list[str] | None) -> dict:
filtered_dict = {}
Expand Down Expand Up @@ -187,9 +191,8 @@ def generate_all_tasks(
self.tasks[task_name].append(task)
except:
continue


def get_all_tasks(self) -> Task:
def get_all_tasks(self) -> Iterator[tuple[str, list[Task]]]:
for task_name, tasks in self.tasks.items():
yield task_name, tasks

Expand Down
10 changes: 9 additions & 1 deletion deval/utils/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,6 +410,14 @@ def add_validator_args(cls, parser):
help="Max time to wait for a forward call to complete in seconds.",
default=64800,
)

parser.add_argument(
"--neuron.use_compute_horde",
action="store_true",
help="Enable the use of Compute Horde (default: False)",
default=False,
)



def config(cls):
Expand All @@ -422,4 +430,4 @@ def config(cls):
bt.logging.add_args(parser)
bt.axon.add_args(parser)
cls.add_args(parser)
return bt.config(parser)
return bt.config(parser)
Loading