Skip to content

Commit a5a95cf

Browse files
jstouracopenshift-merge-bot[bot]
authored andcommitted
RHOAIENG-20088: GHA CI enhancement: check software manifests against images
This adds a new check for the consistency of the information in the ImageStream manifest files. We now check that the software versions info in these files truly matches what is installed in the referenced images. More information is tracked in: * https://issues.redhat.com/browse/RHOAIENG-20569
1 parent ee82b75 commit a5a95cf

File tree

2 files changed

+368
-0
lines changed

2 files changed

+368
-0
lines changed
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
1+
---
# Checks that the software versions declared in the ImageStream manifests match
# what is installed in the images those manifests reference.
name: Validation of software versions referenced in ImageStream manifests
on: # yamllint disable-line rule:truthy
  push:
    # This check is resource demanding, so the push trigger is restricted to the relevant paths too.
    paths:
      - 'manifests/base/params.env'
      - 'manifests/base/*-imagestream.yaml'
      - 'ci/check-software-versions.py'
  pull_request:
    paths:
      - 'manifests/base/params.env'
      - 'manifests/base/*-imagestream.yaml'
      - 'ci/check-software-versions.py'
  workflow_dispatch:
  release:

permissions:
  contents: read

jobs:
  validation-of-sw-versions-in-imagestreams:
    runs-on: ubuntu-latest
    env:
      # Some pieces of code (image pulls for example) in podman consult TMPDIR or default to /var/tmp
      TMPDIR: /home/runner/.local/share/containers/tmpdir
    steps:
      - uses: actions/checkout@v4

      - name: Mount lvm overlay for podman operations
        run: |
          df -h
          free -h

          bash ./ci/cached-builds/gha_lvm_overlay.sh

          df -h
          free -h

      - name: Configure Podman
        run: |
          set -Eeuxo pipefail

          sudo cp ci/cached-builds/storage.conf /etc/containers/storage.conf

          # should reset storage when changing storage.conf
          sudo mkdir -p "$HOME/.local/share/containers/storage/tmp"
          # remote (CONTAINER_HOST) podman does not do reset (and refuses --force option)
          sudo podman system reset --force
          # podman running as service ignores the TMPDIR env var here, let's give it a bind-mount to /var/tmp
          sudo mkdir -p "$TMPDIR"
          sudo mount --bind -o rw,noexec,nosuid,nodev,bind "$TMPDIR" /var/tmp

          # quick check that podman works
          sudo podman info

      - name: Check software versions in manifest referencing the images
        id: software-versions-check
        run: |
          sudo ./ci/check-software-versions.py --prune-podman-data

ci/check-software-versions.py

Lines changed: 308 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,308 @@
1+
#!/usr/bin/python3
2+
#
3+
# This script iterates over the ImageStreams in our manifest files and, for each image version
4+
# listed there, compares the software versions declared in the manifest against what is
5+
# actually installed in the referenced image.
6+
#
7+
# Usage:
8+
# python ./ci/check-software-versions.py [-p|--prune-podman-data]
9+
#
10+
# The script is expected to be executed from the root directory of this repository.
11+
#
12+
13+
import argparse
14+
import json
15+
import logging
16+
import os
17+
import re
18+
import subprocess
19+
import uuid
20+
21+
import yaml
22+
23+
from enum import Enum
24+
25+
# Env-style file (one ``name=value`` pair per line) that maps the variable names
# referenced by the ImageStream manifests to the image links in the registry.
PARAMS_ENV_PATH = "manifests/base/params.env"
27+
28+
class ANNOTATION_TYPE(Enum):
    """Kind of ImageStream annotation a checked item comes from.

    The kind decides which fallback command is used to query the installed
    version when the item has no dedicated command.
    """

    # Items listed under the notebook software annotation.
    SOFTWARE = "software"
    # Items listed under the notebook Python dependencies annotation.
    PYTHON_DEPS = "python-deps"
31+
32+
# Root logger setup: INFO verbosity, each record prefixed with a timestamp and its level.
logging.basicConfig(
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Logger shared by every function in this script.
log = logging.getLogger(__name__)

# Module-wide switch, set from the command line in main(); when enabled, Podman
# data is pruned after each processed image and disk usage is printed.
prune_podman_data = False
40+
41+
def find_imagestream_files(directory="."):
    """Collect the ImageStream manifest paths found below ``directory``.

    The directory tree is walked recursively. A file qualifies when its name
    ends with ``-imagestream.yaml`` and does not start with ``runtime-``.

    Returns:
        The qualifying paths (each joined with its containing directory),
        sorted in ascending order.
    """

    return sorted(
        os.path.join(dirpath, filename)
        for dirpath, _, filenames in os.walk(directory)
        for filename in filenames
        if filename.endswith("-imagestream.yaml") and not filename.startswith("runtime-")
    )
51+
52+
def load_yaml(filepath):
    """Load and parse a YAML file.

    Args:
        filepath: Path of the YAML file to read.

    Returns:
        The parsed document as produced by ``yaml.safe_load``, or ``None`` when
        the file cannot be read or parsed (the failure is logged).
    """

    try:
        # YAML manifests are read as UTF-8 regardless of the locale of the machine.
        with open(filepath, "r", encoding="utf-8") as f:
            return yaml.safe_load(f)
    except (OSError, UnicodeDecodeError, yaml.YAMLError) as e:
        # OSError also covers a missing file, a permission problem or a directory path.
        log.error(f"Loading YAML from '{filepath}': {e}")
        return None
61+
62+
def extract_variable(reference):
    """Pull the variable name out of a ``$(...)``-style reference.

    For example ``'$(odh-rstudio-notebook-image-commit-n-1)'`` yields
    ``'odh-rstudio-notebook-image-commit-n-1'``. The text between the first
    opening parenthesis and the nearest closing one is returned; ``None`` is
    returned when the reference has no parenthesised part.
    """

    found = re.search(r"\((.*?)\)", reference)
    if found is None:
        return None
    return found.group(1)
67+
68+
def get_variable_value(variable_name, params_file_path=PARAMS_ENV_PATH):
    """Look up the value of a variable in a ``name=value`` parameters file.

    Args:
        variable_name: Exact name of the variable to look up.
        params_file_path: Path of the parameters file to search.

    Returns:
        The value with surrounding whitespace removed, or ``None`` when no name
        was given, the file cannot be read or the variable is not defined there
        (each failure is logged).
    """

    if not variable_name:
        log.error(f"No variable name given to look up in '{params_file_path}'!")
        return None

    try:
        with open(params_file_path, "r", encoding="utf-8") as params_file:
            for line in params_file:
                # Split on the first '=' only so values containing '=' stay intact.
                key, separator, value = line.partition("=")
                # Compare the whole key: a substring test would let 'x-n' match 'x-n-1'.
                if separator and key.strip() == variable_name:
                    return value.strip()
    except FileNotFoundError:
        log.error(f"'{params_file_path}' not found!")
        return None
    except Exception as e:
        log.error(f"An unexpected error occurred: {e}")
        return None

    log.error(f"Variable '{variable_name}' not found in '{params_file_path}'!")
    return None
84+
85+
def run_podman_container(image_name, image_link, detach=True):
    """Start a detached Podman container from an image and return its ID.

    Args:
        image_name: Label used to build the temporary container name
            (``tmp-<image_name>-<uuid4>``).
        image_link: Image reference handed to ``podman run``.
        detach: Kept for backward compatibility; the container is always
            started detached and this argument is not consulted.

    Returns:
        The container ID printed by ``podman run``, or ``None`` when the
        container could not be started (the failure is logged).
    """

    try:
        if prune_podman_data:
            # Since we're pruning the data, we're probably interested about current disk space usage.
            # This is diagnostic output only, so a failing 'df' must not block the check.
            subprocess.run(["df", "-h"], check=False)
        container_name = f"tmp-{image_name}-{uuid.uuid4()}"
        result = subprocess.run(
            ["podman", "run", "-d", "--name", container_name, image_link],
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        # The exception text holds only the exit status; Podman's reason is on stderr.
        stderr = (e.stderr or "").strip()
        log.error(f"Error running Podman container '{image_link}': {e} (stderr: {stderr})")
        return None
    except Exception as e:
        log.error(f"Error running Podman container '{image_link}': {e}")
        return None

    container_id = result.stdout.strip()
    log.info(f"Container '{container_id}' started (detached).")
    return container_id
100+
101+
def execute_command_in_container(container_id, command):
    """Run a command inside a running Podman container.

    Args:
        container_id: ID of the container to run the command in.
        command: Command and its arguments as a list, appended to
            ``podman exec <container_id>``.

    Returns:
        The command's standard output with surrounding whitespace removed, or
        ``None`` when the command failed (the failure is logged).
    """

    try:
        result = subprocess.run(
            ["podman", "exec", container_id] + command,
            capture_output=True,
            text=True,
            check=True,
        )
    except subprocess.CalledProcessError as e:
        # The exception text holds only the exit status; the actual reason is on stderr.
        stderr = (e.stderr or "").strip()
        log.error(f"Error executing command '{command}' in container '{container_id}': {e} (stderr: {stderr})")
        return None
    except Exception as e:
        log.error(f"Error executing command '{command}' in container '{container_id}': {e}")
        return None

    output = result.stdout.strip()
    log.debug(output)
    return output
111+
112+
def stop_and_remove_container(container_id):
    """Stop a Podman container and remove it.

    When the module-level ``prune_podman_data`` switch is on, all unused Podman
    data is pruned afterwards as well.

    Returns:
        0 on success; 1 when no container ID was given or when any of the
        Podman commands failed (the failure is logged).
    """

    if not container_id:
        log.error("Given undefined value in 'container_id' argument!")
        return 1

    # Commands run in order; the first failure aborts the remaining ones.
    cleanup_commands = [
        ["podman", "stop", container_id],
        ["podman", "rm", container_id],
    ]
    if prune_podman_data:
        cleanup_commands.append(["podman", "system", "prune", "--all", "--force"])

    try:
        for cleanup_command in cleanup_commands:
            subprocess.run(cleanup_command, check=True)
        log.info(f"Container {container_id} stopped and removed.")
    except Exception as err:
        log.error(f"Error stopping/removing container '{container_id}': {err}")
        return 1

    return 0
129+
130+
def parse_json_string(json_string):
    """Decode a JSON document.

    Returns:
        Whatever ``json.loads`` produces for ``json_string``, or ``None`` when
        decoding fails for any reason (the failure is logged).
    """

    try:
        decoded = json.loads(json_string)
    except Exception as err:
        log.error(f"Error parsing JSON: {err}")
        return None
    return decoded
138+
139+
# Commands printing the installed version of items that need something other than
# the generic '<name> --version' / 'pip show <name>' fallbacks. Keys are the names
# as they appear in the manifest annotations.
_VERSION_COMMANDS = {
    "PyTorch": ["/bin/bash", "-c", "pip show torch | grep 'Version: '"],
    "ROCm": ["/bin/bash", "-c", "rpm -q --queryformat '%{VERSION}\n' rocm"],
    "ROCm-PyTorch": ["/bin/bash", "-c", "pip show torch | grep 'Version: ' | grep rocm"],
    "ROCm-TensorFlow": ["/bin/bash", "-c", "pip show tensorflow-rocm | grep 'Version: '"],
    "TensorFlow": ["/bin/bash", "-c", "pip show tensorflow | grep 'Version: '"],
    "R": ["/bin/bash", "-c", "R --version"],
    "rstudio-server": ["/bin/bash", "-c", "rpm -q --queryformat '%{VERSION}\n' rstudio-server"],
    "Sklearn-onnx": ["/bin/bash", "-c", "pip show skl2onnx | grep 'Version: '"],
    "MySQL Connector/Python": ["/bin/bash", "-c", "pip show mysql-connector-python | grep 'Version: '"],
    "Nvidia-CUDA-CU12-Bundle": ["/bin/bash", "-c", "pip show nvidia-cuda-runtime-cu12 | grep 'Version: '"],
    "Python": ["/bin/bash", "-c", "python --version"],
    "CUDA": ["/bin/bash", "-c", "nvcc --version"],
}


def _version_matches(expected, output):
    """Tell whether ``output`` reports ``expected`` as a whole version prefix."""

    if not expected or not output:
        return False
    # The expected version may be a prefix of the installed one ('3.11' vs '3.11.7'),
    # but it must not start or end in the middle of a number: '3.1' must not pass
    # for '3.11.7' and '1.2' must not pass for '11.2'.
    pattern = rf"(?<![\d.]){re.escape(expected)}(?!\d)"
    return re.search(pattern, output) is not None


def process_dependency_item(item, container_id, annotation_type):
    """Verify one manifest item against what is installed in the container.

    Args:
        item: Dictionary with the ``name`` and ``version`` declared in the manifest.
        container_id: ID of the running container to query.
        annotation_type: ``ANNOTATION_TYPE`` member telling which annotation the
            item comes from; it selects the fallback command for names without
            a dedicated command.

    Returns:
        0 when the declared version (a leading ``v`` is ignored) is reported by
        the container, 1 otherwise (the failure is logged).
    """

    import shlex

    if not isinstance(item, dict):
        log.error(f"Missing name or version in item: {item}")
        return 1

    name, version = item.get("name"), item.get("version")
    if not name or not version:
        log.error(f"Missing name or version in item: {item}")
        return 1

    log.info(f"Checking {name} (version {version}) in container...")

    command = _VERSION_COMMANDS.get(name)
    if not command:
        # The name comes from the manifest; quote it so it reaches the shell as a single word.
        quoted_name = shlex.quote(str(name).lower())
        if annotation_type == ANNOTATION_TYPE.SOFTWARE:
            command = ["/bin/bash", "-c", f"{quoted_name} --version"]
        else:
            command = ["/bin/bash", "-c", f"pip show {quoted_name} | grep 'Version: '"]

    output = execute_command_in_container(container_id, command)

    if not _version_matches(str(version).lstrip("v"), output):
        log.error(f"{name} version check failed. Expected '{version}', found '{output}'.")
        return 1

    log.info(f"{name} version check passed.")
    return 0
180+
181+
def _parse_annotation_items(raw_value, annotation):
    """Decode one JSON annotation; return its list of items, or None when it is not a JSON list."""

    items = parse_json_string(raw_value)
    if not isinstance(items, list):
        log.error(f"The '{annotation}' annotation does not contain a valid JSON list!")
        return None
    return items


def process_tag(tag):
    """Check one ImageStream tag against the image it references.

    Tags marked as outdated are skipped. Otherwise the image link is resolved
    through the parameters file, both software annotations are validated, a
    container is started from the image and every declared item is verified in
    it. The container is always stopped and removed afterwards.

    Args:
        tag: Dictionary describing one entry of ``spec.tags`` in an ImageStream.

    Returns:
        0 when the tag was skipped or all checks passed, 1 otherwise (failures
        are logged).
    """

    ret_code = 0

    # An 'annotations:' key with no value is parsed as None, treat it as empty.
    tag_annotations = tag.get("annotations") or {}

    if "name" not in tag:
        log.error(f"Missing 'name' field for {tag}!")
        return 1

    log.info(f"Processing tag: {tag['name']}.")
    outdated_annotation = "opendatahub.io/image-tag-outdated"
    if tag_annotations.get(outdated_annotation) == "true":
        log.info("Skipping processing of this tag as it is marked as outdated.")
        print_delimiter()
        return 0

    image_ref = (tag.get("from") or {}).get("name")
    if not image_ref:
        log.error(f"Missing 'from.name' in tag {tag['name']}")
        return 1

    image_var = extract_variable(image_ref)
    image_val = get_variable_value(image_var) if image_var else None
    log.debug(f"Retrieved image link: '{image_val}'")

    if not image_val:
        log.error(f"Failed to parse image value reference pointing by '{image_ref}'!")
        return 1

    ntb_sw_annotation = "opendatahub.io/notebook-software"
    python_dep_annotation = "opendatahub.io/notebook-python-dependencies"

    # Validate the annotations before starting the container, so that an image
    # is not run for a tag that cannot be checked anyway.
    checks = []
    for annotation, annotation_type in (
        (ntb_sw_annotation, ANNOTATION_TYPE.SOFTWARE),
        (python_dep_annotation, ANNOTATION_TYPE.PYTHON_DEPS),
    ):
        raw_value = tag_annotations.get(annotation)
        if not raw_value:
            log.error(f"Missing '{annotation}' in ImageStream tag '{tag}'!")
            return 1

        items = _parse_annotation_items(raw_value, annotation)
        if items is None:
            # An unreadable annotation is a failure, but the other one is still checked.
            ret_code = 1
            continue
        checks.append((annotation_type, items))

    container_id = run_podman_container(image_var, image_val)
    if not container_id:
        log.error(f"Failed to start a container from image '{image_val}' for the '{image_ref}' tag!")
        return 1

    try:
        for annotation_type, items in checks:
            for item in items:
                if process_dependency_item(item, container_id, annotation_type) != 0:
                    log.error(f"Failed check for the '{image_ref}' tag!")
                    ret_code = 1
    finally:
        # No 'return' in here: it would swallow an exception raised by the checks above.
        if stop_and_remove_container(container_id) != 0:
            log.error(f"Failed to stop/remove the container '{container_id}' for the '{image_ref}' tag!")
            ret_code = 1
        print_delimiter()

    return ret_code
245+
246+
def process_imagestream(imagestream):
    """Process a single ImageStream file and check the images it references.

    Args:
        imagestream: Path of the ImageStream manifest file.

    Returns:
        0 when every tag of the ImageStream was processed successfully, 1 when
        the file is not a usable ImageStream manifest or any tag failed.
    """

    log.info(f"Processing ImageStream: {imagestream}.")

    yaml_data = load_yaml(imagestream)
    # Keys with no value are parsed as None, so check the types, not just the presence.
    spec = yaml_data.get("spec") if isinstance(yaml_data, dict) else None
    tags = spec.get("tags") if isinstance(spec, dict) else None
    if not isinstance(tags, list):
        log.error(f"Invalid YAML in {imagestream} as ImageStream file!")
        return 1

    ret_code = 0
    # Process each image version in the ImageStream; a failing tag does not stop the others.
    for tag in tags:
        if not isinstance(tag, dict) or process_tag(tag) != 0:
            log.error(f"Failed to process tag {tag} in ImageStream {imagestream}!")
            ret_code = 1

    return ret_code
266+
267+
def print_delimiter():
    """Log a horizontal rule followed by an empty message to separate sections of the output."""

    for message in ("----------------------------------------------------------------------", ""):
        log.info(message)
270+
271+
def main():
    """Check every ImageStream manifest found under the current directory.

    Parses the command line, stores the pruning switch in the module-level
    ``prune_podman_data`` flag and processes each ImageStream manifest in turn.

    Raises:
        SystemExit: Always; the exit status is 0 when all checks passed and 1
            when no manifest was found or any check failed.
    """

    global prune_podman_data

    parser = argparse.ArgumentParser(description="Process command-line arguments.")
    parser.add_argument(
        "-p",
        "--prune-podman-data",
        action="store_true",
        help="Prune Podman data after each image is processed. This is useful when running in GHA workers.",
    )

    args = parser.parse_args()
    prune_podman_data = args.prune_podman_data

    log.info("Starting the check ImageStream software version references.")

    imagestreams = find_imagestream_files()
    if not imagestreams:
        log.error("Failed to detect any ImageStream manifest files!")
        # SystemExit is raised directly: the exit() helper is only injected by the site module.
        raise SystemExit(1)

    log.info("Following list of ImageStream manifests has been found:")
    for imagestream in imagestreams:
        log.info(imagestream)

    print_delimiter()

    ret_code = 0
    # A failing ImageStream does not stop the processing of the remaining ones.
    for imagestream in imagestreams:
        if process_imagestream(imagestream) != 0:
            log.error(f"Failed to process {imagestream} ImageStream manifest file!")
            ret_code = 1

    if ret_code == 0:
        log.info("The software versions check in manifests was successful. Congrats! :)")
    else:
        log.error("The software version check failed, see errors above in the log for more information!")

    raise SystemExit(ret_code)


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)