Hourly Commit Check and Tests #272

Workflow file for this run

.github/workflows/hourly-ci.yaml at b6768b9

	# This workflow runs tests on a schedule and can also be triggered manually.
	# It is designed to run each test suite in parallel for faster execution.
	#
	# Job graph:
	#   discover_tests -> setup_and_build -> { run_unit_tests, e2e (matrix),
	#   run_data_parallel_test, run_pd_disaggregate_test }
	#   -> store_last_stable_vllm_commit
	#
	# All jobs use the 'hourly-ci' runner label and share the locally built
	# Docker image 'hpu-plugin-v1-test-env-hourly-ci'.

	name: Hourly Commit Check and Tests

	on:
	  # Schedule the workflow to run every 4 hours
	  schedule:
	    # Runs at minute 0, every 4th hour (0, 4, 8, 12, 16, 20 UTC)
	    - cron: '0 */4 * * *'

	  # Allow manual triggering from the GitHub Actions UI
	  workflow_dispatch: {}

	jobs:
	  # JOB 1: Sets up the environment and builds the Docker image needed for all tests.
	  setup_and_build:
	    # Run for manual dispatches from any ref, or for any event on main.
	    if: |
	      github.event_name == 'workflow_dispatch' ||
	      github.ref == 'refs/heads/main'
	    runs-on: hourly-ci
	    needs: discover_tests
	    permissions:
	      contents: read # Required to checkout code and read history
	    outputs:
	      # SHA of vllm-upstream/main that the image was built against.
	      latest_commit: ${{ steps.latest_vllm_commit.outputs.LATEST_COMMIT }}
	    steps:
	      - name: Checkout repository
	        uses: actions/checkout@v4
	        with:
	          # Fetch full history for accurate commit comparison
	          fetch-depth: 0

	      - name: Pre-Job Workspace Cleanup
	        if: always()
	        run: |
	          echo "Attempting to remove remote branch if it exists..."
	          # Best-effort: the remote may be left over from a previous run.
	          git remote remove vllm-upstream || true
	          echo "Cleanup complete."

	      - name: Clean and Checkout repository again
	        uses: actions/checkout@v4
	        with:
	          fetch-depth: 0
	          clean: true # Ensure a clean workspace before checkout

	      - name: Add vLLM upstream as a remote and fetch its history
	        run: |
	          git remote add vllm-upstream https://github.com/vllm-project/vllm.git
	          git fetch vllm-upstream --depth=100

	      - name: Calculate previous run time
	        id: prev_run_time
	        run: |
	          # Matches the 4-hour cron cadence above.
	          PREV_RUN_TIME=$(date -u -d "4 hours ago" +"%Y-%m-%dT%H:%M:%SZ")
	          echo "Looking for commits since: $PREV_RUN_TIME"
	          echo "PREV_RUN_TIME=$PREV_RUN_TIME" >> "$GITHUB_OUTPUT"

	      - name: List commit differences in the last 4 hours
	        run: |
	          echo "Commits merged/pushed in vllm-project/vllm.git in the last 4 hours:"
	          git log HEAD..vllm-upstream/main --pretty=format:"%h - %an, %ar : %s" --since="${{ steps.prev_run_time.outputs.PREV_RUN_TIME }}"

	      - name: Get latest commit sha from vllm-upstream/main
	        id: latest_vllm_commit
	        run: |
	          # Use the 'vllm-upstream/main' ref to log latest commit from that remote
	          LATEST_COMMIT=$(git rev-parse vllm-upstream/main)
	          echo "LATEST_COMMIT=$LATEST_COMMIT" >> "$GITHUB_OUTPUT"
	          echo "Latest commit from upstream vLLM: $LATEST_COMMIT"

	      - name: Setup Docker environment and build image
	        run: |
	          echo "Attempting to build Docker image..."
	          # The Dockerfile is fed on stdin ('-f -'); the build context is the checkout.
	          docker build --no-cache -t hpu-plugin-v1-test-env-hourly-ci -f - . <<EOF
	          FROM vault.habana.ai/gaudi-docker/1.22.0/ubuntu24.04/habanalabs/pytorch-installer-2.7.1:latest

	          COPY ./ /workspace/vllm-gaudi
	          WORKDIR /workspace

	          RUN git clone https://github.com/vllm-project/vllm.git vllm
	          WORKDIR /workspace/vllm
	          RUN git checkout ${{ steps.latest_vllm_commit.outputs.LATEST_COMMIT }}

	          RUN pip install pytest pytest_asyncio pytest-timeout
	          RUN pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git

	          ENV no_proxy=localhost,127.0.0.1
	          ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=true

	          # Drop only the torch pin from the build requirements. The previous
	          # pattern '/^[torch]/d' was a bracket expression and removed every
	          # line starting with t, o, r, c or h.
	          RUN bash -c 'pip install -r <(sed "/^torch/d" requirements/build.txt)'
	          RUN VLLM_TARGET_DEVICE=empty pip install --no-build-isolation .

	          RUN python3 -m pip install -e tests/vllm_test_utils

	          WORKDIR /workspace/vllm-gaudi
	          RUN pip install -e .

	          WORKDIR /workspace
	          RUN ln -s /workspace/vllm/tests /workspace/tests \
	              && ln -s /workspace/vllm/examples /workspace/examples \
	              && ln -s /workspace/vllm/benchmarks /workspace/benchmarks
	          EOF
	          echo "Docker image built successfully."

	  # JOB 2: Runs the plugin unit tests inside the freshly built image.
	  run_unit_tests:
	    needs: setup_and_build
	    runs-on: hourly-ci
	    steps:
	      - name: Run pytest in tests/unit_tests
	        env:
	          # Passed through to the container with 'docker run -e HF_TOKEN' so
	          # the secret is never interpolated into the script text.
	          HF_TOKEN: ${{ secrets.HF_TOKEN }}
	        run: |
	          # EXITCODE starts at 1 so that an early abort is reported as a failure.
	          EXITCODE=1
	          remove_docker_containers() { docker rm -f hpu-plugin-v1-test-unit-tests-hourly-ci || true; }
	          trap 'remove_docker_containers; exit $EXITCODE;' EXIT
	          remove_docker_containers

	          echo "Running HPU plugin v1 unit tests"
	          docker run --rm --runtime=habana --name=hpu-plugin-v1-test-unit-tests-hourly-ci --network=host \
	            -e HABANA_VISIBLE_DEVICES=all \
	            -e HF_HOME=/workspace/hf_cache \
	            -e HF_TOKEN \
	            -v /mnt/hf_cache:/workspace/hf_cache \
	            hpu-plugin-v1-test-env-hourly-ci \
	            /bin/bash -c "pytest -vvv --timeout=300 --durations=10 --durations-min=1.0 /workspace/vllm-gaudi/tests/unit_tests"

	          EXITCODE=$?
	          echo "Test script exited with code: $EXITCODE"

	  # JOB 3: Builds the e2e test matrix from the functions in ci_gsm8k_tests.sh.
	  discover_tests:
	    runs-on: hourly-ci
	    outputs:
	      matrix: ${{ steps.set-matrix.outputs.matrix }}
	    steps:
	      - name: Checkout repository
	        uses: actions/checkout@v4
	      - name: Discover test functions
	        id: set-matrix
	        run: |
	          # Collect every line of the script that starts with 'run_', keep the
	          # first whitespace-separated token, strip the trailing '()' and
	          # format the names as a compact JSON array for the matrix strategy.
	          # NOTE(review): an earlier comment said 'run_all_tests' is excluded,
	          # but this pipeline has no explicit exclusion - confirm against
	          # tests/full_tests/ci_gsm8k_tests.sh.
	          TEST_FUNCTIONS=$( grep '^run_' ./tests/full_tests/ci_gsm8k_tests.sh | \
	            awk '{print $1}' | \
	            sed 's/()//' | \
	            jq -R . | jq -s -c . )

	          echo "Discovered test matrix: $TEST_FUNCTIONS"
	          # Fail the job if no tests were found.
	          if [ "$TEST_FUNCTIONS" = "[]" ]; then
	            echo "::error::No test functions were discovered. Failing the workflow."
	            exit 1
	          fi
	          echo "matrix=$TEST_FUNCTIONS" >> "$GITHUB_OUTPUT"

	  # JOB 4: Runs each discovered test function as its own matrix entry.
	  e2e:
	    needs: [setup_and_build, discover_tests]
	    runs-on: hourly-ci
	    strategy:
	      fail-fast: false
	      matrix:
	        # The list of test functions is dynamically populated from the output of the 'discover_tests' job.
	        test_function: ${{ fromJson(needs.discover_tests.outputs.matrix) }}

	    steps:
	      - name: Run test suite - ${{ matrix.test_function }}
	        env:
	          HF_TOKEN: ${{ secrets.HF_TOKEN }}
	          # Read from the environment instead of being expanded into the
	          # script, so the discovered name cannot alter the shell code.
	          TEST_FUNCTION: ${{ matrix.test_function }}
	        run: |
	          EXITCODE=1
	          CONTAINER_NAME="hpu-plugin-test-${TEST_FUNCTION}-${{ github.run_id }}"
	          # Ensure the container is removed upon exit, regardless of success or failure.
	          remove_docker_containers() { docker rm -f "$CONTAINER_NAME" || true; }
	          trap 'remove_docker_containers; exit $EXITCODE;' EXIT
	          remove_docker_containers

	          echo "Running HPU plugin test: ${TEST_FUNCTION}"
	          docker run --rm --runtime=habana --name="$CONTAINER_NAME" --network=host \
	            -e HABANA_VISIBLE_DEVICES=all \
	            -e HF_HOME=/workspace/hf_cache \
	            -e HF_TOKEN \
	            -v /mnt/hf_cache:/workspace/hf_cache \
	            hpu-plugin-v1-test-env-hourly-ci \
	            /bin/bash "/workspace/vllm-gaudi/tests/full_tests/ci_gsm8k_tests.sh" "${TEST_FUNCTION}"

	          EXITCODE=$?
	          echo "Test script exited with code: $EXITCODE"

	  # JOB 5: Runs the data-parallel example (dp-size 2, tp-size 2).
	  run_data_parallel_test:
	    needs: setup_and_build
	    runs-on: hourly-ci
	    steps:
	      - name: Run Data Parallel test
	        env:
	          HF_TOKEN: ${{ secrets.HF_TOKEN }}
	        run: |
	          EXITCODE=1
	          remove_docker_containers() { docker rm -f hpu-plugin-v1-test-dp-tests-hourly-ci || true; }
	          trap 'remove_docker_containers; exit $EXITCODE;' EXIT
	          remove_docker_containers

	          echo "Running HPU plugin v1 dp tests"
	          docker run --rm --runtime=habana --name=hpu-plugin-v1-test-dp-tests-hourly-ci --network=host \
	            -e HABANA_VISIBLE_DEVICES=all \
	            -e HF_HOME=/workspace/hf_cache \
	            -e VLLM_SKIP_WARMUP=true \
	            -e PT_HPU_LAZY_MODE=1 \
	            -e VLLM_USE_V1=1 \
	            -e HF_TOKEN \
	            -v /mnt/hf_cache:/workspace/hf_cache \
	            hpu-plugin-v1-test-env-hourly-ci \
	            /bin/bash -c "python -u /workspace/vllm-gaudi/examples/data_parallel.py --dp-size 2 --tp-size 2"

	          EXITCODE=$?
	          echo "Test script exited with code: $EXITCODE"

	  # JOB 6: Runs the nixl prefill/decode disaggregation accuracy test.
	  run_pd_disaggregate_test:
	    needs: setup_and_build
	    runs-on: hourly-ci
	    steps:
	      - name: Run PD disaggregate test
	        env:
	          HF_TOKEN: ${{ secrets.HF_TOKEN }}
	        run: |
	          EXITCODE=1
	          remove_docker_containers() { docker rm -f hpu-plugin-v1-test-pd-tests-hourly-ci || true; }
	          trap 'remove_docker_containers; exit $EXITCODE;' EXIT
	          remove_docker_containers

	          echo "Running HPU plugin v1 nixl pd tests"
	          docker run --rm --runtime=habana --name=hpu-plugin-v1-test-pd-tests-hourly-ci --network=host \
	            -e HABANA_VISIBLE_DEVICES=all \
	            -e HF_HOME=/workspace/hf_cache \
	            -e HF_TOKEN \
	            -v /mnt/hf_cache:/workspace/hf_cache \
	            -v /mnt/wheels_cache:/workspace/wheels_cache \
	            hpu-plugin-v1-test-env-hourly-ci \
	            /bin/bash -c "
	              pip install lm-eval[api] &&
	              cd /workspace/vllm-gaudi/tests/unit_tests &&
	              ./run_accuracy_test.sh
	            "
	          EXITCODE=$?
	          echo "Test script exited with code: $EXITCODE"

	  # JOB 7: Records the upstream vLLM SHA once every test job has succeeded.
	  store_last_stable_vllm_commit:
	    needs: [setup_and_build, run_unit_tests, e2e, run_data_parallel_test, run_pd_disaggregate_test]
	    runs-on: hourly-ci
	    permissions:
	      contents: write # Permission is required to push a commit
	    steps:
	      - name: Checkout repository
	        uses: actions/checkout@v4
	        with:
	          ref: vllm/last-good-commit-for-vllm-gaudi
	          fetch-depth: 0 # Fetch full history to ensure we can push changes

	      - name: Configure Git
	        run: |
	          git config user.name "GitHub Actions Bot"
	          git config user.email "github-actions[bot]@users.noreply.github.com"

	      - name: Store last stable vllm commit sha
	        run: |
	          LATEST_COMMIT_SHA="${{ needs.setup_and_build.outputs.latest_commit }}"
	          # Never record an empty value as the "stable" commit.
	          if [ -z "$LATEST_COMMIT_SHA" ]; then
	            echo "::error::setup_and_build did not provide a commit SHA."
	            exit 1
	          fi
	          echo "Storing latest stable vLLM commit SHA: $LATEST_COMMIT_SHA"
	          echo "$LATEST_COMMIT_SHA" > VLLM_STABLE_COMMIT

	          # Only commit and push if the file has changed to avoid empty commits
	          git add VLLM_STABLE_COMMIT
	          if git diff --cached --quiet; then
	            echo "VLLM_STABLE_COMMIT already records $LATEST_COMMIT_SHA; nothing to push."
	            exit 0
	          fi
	          git commit -m "Update stable vLLM commit to ${LATEST_COMMIT_SHA}"

	          echo "Pushing changes to remote branch..."
	          # Explicitly set the remote URL with the token to prevent hanging on auth
	          git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}
	          git push --force origin HEAD:vllm/last-good-commit-for-vllm-gaudi

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Hourly Commit Check and Tests #272

Workflow file

Hourly Commit Check and Tests #272

Uh oh!

Jobs

Run details

Workflow file for this run