Torch run console script
torchrun --nproc_per_node=NUM_GPUS_YOU_HAVE [Link] --arg1 value1 --arg2 value2
torchrun --nproc_per_node=8 [Link] \
--training_dir /home/ec2-user/SageMaker/9.1_EXP_fairscale/fairscale/train \
--test_dir /home/ec2-user/SageMaker/9.1_EXP_fairscale/fairscale/test \
--model_uri /home/ec2-user/SageMaker/9.1_EXP_fairscale/fairscale/model_uri \
--rank 0 \
--world_size 8
Deepspeed console script
deepspeed --num_gpus=8 src/[Link] --deepspeed_config=[Link]
deepspeed --num_gpus 8 training_1/src/train_bash.py \
--deepspeed examples/deepspeed/ds_z2_config.json \
--stage sft
ECR login
aws ecr get-login --no-include-email --region "us-east-1" | awk '{print $6}' | docker login -u AWS --password-stdin "[Link]"
Docker pull
docker pull [OPTIONS] IMAGE[:TAG|@DIGEST]
docker pull [Link]/sagemaker-training-containers/isbank:llamafactory
Docker run
docker run [OPTIONS] IMAGE[:TAG|@DIGEST] [COMMAND] [ARG...]
docker run -it --name training_2 [Link]/sagemaker-training-containers/isbank:llamafactory
Docker tag
docker tag SOURCE_IMAGE[:TAG] TARGET_IMAGE[:TAG]
docker tag cd91f33067a8 [Link]/sagemaker-training-containers/isbank:llamafactory_2.0
Docker push
docker push username/my-image:latest
docker push [Link]/sagemaker-training-containers/isbank:llamafactory_2.0
Docker rmi
docker rmi [OPTIONS] IMAGE [IMAGE...]
docker rmi [Link]/sagemaker-training-containers/isbank:llamafactory
Docker interactive
docker exec -it <container ID> /bin/bash
Mount volume for Docker
docker run -v /home/ec2-user/SageMaker:/opt/ml -it [Link]-east-[Link]/sagemaker-training-containers/isbank:llamafactory_2.0
Docker run all gpus
docker run --gpus all -v /home/ec2-user/SageMaker:/opt/ml -it [Link]-east-[Link]/sagemaker-training-containers/isbank:llamafactory_2.0
Docker build
docker-compose build
docker-compose up
Monitor nvidia GPU devices
nvidia-smi
Install aws-cli
sudo apt install unzip
curl "[Link]" -o "[Link]"
unzip [Link]
./aws/install --bin-dir ~/bin --install-dir ~/aws-cli --update
Download model s3 to local instance
/[Link]/aws s3 sync <source> <destination>
/[Link]/aws s3 sync s3://ish-bank-data/NewModel/Retriever/ /opt/ml/input/data/model
Cache cleanup
To inspect the cache: cd ~/.cache/
rm -Rf ~/.cache/
Alembic
alembic heads
alembic revision -m 'message' ==> generates new migration/revision and assigns the metadata
alembic history ==> see complete log of applied migrations to the database.
alembic current ==> see the current head/revision
alembic upgrade head ==> to migrate the database to the head revision
alembic downgrade -1 ==> moves the database to previous revision
Git Commands
git fetch origin
git checkout main
git pull origin main
git checkout mybranch
git rebase main
alembic history
git push -u origin mybranch --force
git fetch --all
git config --global user.name " "
git config --global user.email " "
git add .