Skip to content

Commit f7a2862

Browse files
itamargolanclaude
and committed
[NA] [SDK] feat: add greenfield optimization framework package
Implements a new optimization framework (`apps/opik-optimizer`) that decouples optimizer algorithms from experiment execution, persistence, and UI concerns. Integrates via the existing optimization studio pipeline (Redis queue → Python backend → subprocess). Key components: - Orchestrator: central lifecycle controller with sampler, validator, materializer, result aggregator, and event emitter - StupidOptimizer: 2-step test optimizer (3 candidates → best → 2 more) - EvaluationAdapter: wraps SDK evaluate_optimization_suite_trial() - Backend integration: new Redis queue, framework_optimizer job processor, framework_runner subprocess entry point Also adds evaluate_optimization_suite_trial() to the Python SDK, combining optimization trial linkage with evaluation suite behavior (evaluators and execution policy from the dataset). 53 unit + integration tests passing. Verified end-to-end against Comet cloud with real LLM calls, UI progress chart, prompt display, and score tracking. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent 3dcdbca commit f7a2862

42 files changed

Lines changed: 2330 additions & 996 deletions

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

.github/workflows/build_and_push_docker.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ jobs:
147147
uses: docker/build-push-action@v6
148148
with:
149149
context: apps/${{ inputs.image }}/
150+
build-contexts: ${{ inputs.image == 'opik-python-backend' && 'opik-optimizer=apps/opik-optimizer/' || '' }}
150151
platforms: linux/${{ matrix.platform }}
151152
cache-from: type=registry,ref=${{ env.DOCKER_REGISTRY }}/${{ steps.set_vars.outputs.image_name }}:main
152153
provenance: false

apps/opik-backend/src/main/java/com/comet/opik/domain/OptimizationService.java

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -366,8 +366,9 @@ private void enqueueStudioOptimizationJob(Optimization optimization, String work
366366
.opikApiKey(opikApiKey)
367367
.build();
368368

369-
// Enqueue to Redis RQ
370-
queueProducer.enqueue(Queue.OPTIMIZER_CLOUD, jobMessage)
369+
// Route to the appropriate queue based on optimizer type
370+
var queue = resolveQueue(optimization);
371+
queueProducer.enqueue(queue, jobMessage)
371372
.doOnSuccess(
372373
jobId -> log.info("Studio optimization job enqueued successfully for id: '{}', jobId: '{}'",
373374
optimization.id(), jobId))
@@ -379,6 +380,20 @@ private void enqueueStudioOptimizationJob(Optimization optimization, String work
379380
.subscribe();
380381
}
381382

383+
private static final java.util.Set<String> LEGACY_OPTIMIZER_TYPES = java.util.Set.of(
384+
"gepa", "evolutionary", "hierarchical_reflective");
385+
386+
private Queue resolveQueue(Optimization optimization) {
387+
if (optimization.studioConfig() != null
388+
&& optimization.studioConfig().optimizer() != null) {
389+
var optimizerType = optimization.studioConfig().optimizer().type();
390+
if (optimizerType != null && !LEGACY_OPTIMIZER_TYPES.contains(optimizerType.toLowerCase())) {
391+
return Queue.OPTIMIZER_FRAMEWORK;
392+
}
393+
}
394+
return Queue.OPTIMIZER_CLOUD;
395+
}
396+
382397
private void cancelOptimization(UUID optimizationId, String workspaceId) {
383398
var optimizationUpdate = OptimizationUpdate.builder()
384399
.status(OptimizationStatus.CANCELLED)

apps/opik-backend/src/main/java/com/comet/opik/infrastructure/queues/Queue.java

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
public enum Queue {
77

88
OPTIMIZER_CLOUD("opik:optimizer-cloud", "opik_backend.rq_worker.process_optimizer_job"),
9+
OPTIMIZER_FRAMEWORK("opik:optimizer-framework", "opik_backend.rq_worker.process_framework_optimizer_job"),
910
;
1011

1112
@JsonValue

apps/opik-optimizer/pyproject.toml

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
[project]
2+
name = "opik-optimizer-framework"
3+
version = "0.1.0"
4+
requires-python = ">=3.11"
5+
dependencies = ["opik>=1.7.17", "litellm"]
6+
7+
[project.optional-dependencies]
8+
dev = ["pytest>=8.0", "pytest-cov>=5.0"]
9+
10+
[tool.setuptools.packages.find]
11+
where = ["src"]
Lines changed: 221 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,221 @@
1+
#!/usr/bin/env python
2+
"""
3+
End-to-end test script for the optimization framework.
4+
5+
Creates an evaluation suite with LLMJudge assertions (including item-level
6+
overrides), then runs a real optimization against a local Opik backend so
7+
you can see the results in the Optimization Studio UI.
8+
9+
Prerequisites:
10+
- Local Opik backend running (http://localhost:8080)
11+
- OPENAI_API_KEY set in environment (or another LLM provider supported by litellm)
12+
- pip install -e apps/opik-optimizer (the framework package)
13+
- pip install -e sdks/python (the Opik SDK)
14+
15+
Usage:
16+
export OPENAI_API_KEY=sk-...
17+
python apps/opik-optimizer/scripts/run_optimization_e2e.py
18+
"""
19+
20+
import logging
21+
import os
22+
import sys
23+
import time
24+
25+
logging.basicConfig(
26+
level=logging.INFO,
27+
format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
28+
)
29+
logger = logging.getLogger("e2e-test")
30+
31+
# -- Configuration ----------------------------------------------------------
32+
33+
OPIK_URL = os.environ.get("OPIK_URL_OVERRIDE") # None = use SDK default (cloud)
34+
OPIK_WORKSPACE = os.environ.get("OPIK_WORKSPACE", "default")
35+
OPIK_API_KEY = os.environ.get("OPIK_API_KEY")
36+
37+
SUITE_NAME = f"customer-support-regression-tests-{int(time.time())}"
38+
OPTIMIZATION_NAME = "e2e-framework-test"
39+
OBJECTIVE_NAME = "llm_judge"
40+
41+
# The model litellm will call for the optimization task.
42+
MODEL = os.environ.get("OPIK_TEST_MODEL", "gpt-4o-mini")
43+
44+
PROMPT_MESSAGES = [
45+
{
46+
"role": "system",
47+
"content": (
48+
"You are a helpful customer support agent for an e-commerce company. "
49+
"Be professional, empathetic, and provide clear, actionable responses. "
50+
"If you don't know something, be honest about it."
51+
),
52+
},
53+
{
54+
"role": "user",
55+
"content": "Customer question: {question}\nAdditional context: {context}",
56+
},
57+
]
58+
59+
# ---------------------------------------------------------------------------
60+
61+
62+
def _update_optimization_status(client, optimization_id, status):
63+
"""Update optimization status via the SDK's REST client."""
64+
client.rest_client.optimizations.update_optimizations_by_id(
65+
optimization_id, status=status,
66+
)
67+
68+
69+
def main():
    """Run the full e2e flow: build a suite, create an optimization, run it, report."""
    import opik
    from opik.evaluation.suite_evaluators import LLMJudge
    from opik_optimizer_framework import OptimizationContext, run_optimization

    # 1. Point the SDK at the requested backend and open a client.
    if OPIK_URL:
        os.environ["OPIK_URL_OVERRIDE"] = OPIK_URL
    logger.info("Connecting to Opik (workspace: %s, url: %s)", OPIK_WORKSPACE, OPIK_URL or "cloud default")

    opik_client = opik.Opik(workspace=OPIK_WORKSPACE, api_key=OPIK_API_KEY)

    # 2. Create the evaluation suite — exact dataset from the example script.
    logger.info("Creating evaluation suite '%s'", SUITE_NAME)
    eval_suite = opik_client.create_evaluation_suite(
        name=SUITE_NAME,
        description="Regression tests for customer support agent responses",
        evaluators=[
            LLMJudge(
                assertions=[
                    "Response is relevant to the user question",
                ]
            )
        ],
        execution_policy={"runs_per_item": 1, "pass_threshold": 1},
    )

    # Only the account-security case (test case 3, CRITICAL) carries item-level
    # evaluators and a stricter execution policy; item-level settings OVERRIDE
    # the suite-level ones for that item.
    security_overrides = {
        "evaluators": [
            LLMJudge(
                assertions=[
                    "The response treats the security concern with appropriate urgency",
                    "The response advises immediate steps to secure the account",
                    "The response mentions that unauthorized orders will be investigated",
                ]
            )
        ],
        "execution_policy": {"runs_per_item": 5, "pass_threshold": 4},
    }

    # (data, item-level overrides) per test case; empty overrides fall back to
    # the suite-level evaluators and policy.
    test_cases = [
        # 1: refund request
        ({"question": "I received a damaged product. How can I get a refund?",
          "context": "Order #12345, placed 3 days ago"}, {}),
        # 2: shipping inquiry
        ({"question": "Where is my package? It was supposed to arrive yesterday.",
          "context": "Tracking number: TRK789456"}, {}),
        # 3: account security (CRITICAL) — overrides apply
        ({"question": "I think someone hacked my account. I see orders I didn't make!",
          "context": "Customer reports unauthorized activity"}, security_overrides),
        # 4: product question
        ({"question": "Is the XYZ Wireless Headphones compatible with iPhone 15?",
          "context": "Product SKU: WH-2024-BLK"}, {}),
        # 5: subscription cancellation
        ({"question": "I want to cancel my premium subscription. This is too expensive.",
          "context": "Customer has been subscribed for 6 months"}, {}),
    ]
    for data, overrides in test_cases:
        eval_suite.add_item(data=data, **overrides)

    # The framework needs the raw dataset item ids from the underlying dataset.
    item_ids = [str(item["id"]) for item in eval_suite.dataset.get_items()]
    logger.info("Suite has %d items", len(item_ids))

    # 3. Create the optimization record (status is set to "running" automatically).
    logger.info("Creating optimization record")
    optimization = opik_client.create_optimization(
        dataset_name=SUITE_NAME,
        objective_name=OBJECTIVE_NAME,
        name=OPTIMIZATION_NAME,
    )
    optimization_id = optimization.id
    logger.info("Optimization created: %s", optimization_id)

    # 4. Run the framework — evaluators come from the suite itself.
    opt_context = OptimizationContext(
        optimization_id=optimization_id,
        dataset_name=SUITE_NAME,
        prompt_messages=PROMPT_MESSAGES,
        model=MODEL,
        model_parameters={"temperature": 0.7, "max_tokens": 256},
        metric_type=OBJECTIVE_NAME,
        metric_parameters={},
        optimizer_type="stupid",
        optimizer_parameters={},
    )

    logger.info("Starting optimization (optimizer_type=stupid, model=%s)", MODEL)
    try:
        result = run_optimization(
            context=opt_context,
            client=opik_client,
            dataset_item_ids=item_ids,
        )
        _update_optimization_status(opik_client, optimization_id, "completed")
    except Exception:
        # Mark the record as errored so the UI doesn't show it running forever.
        logger.exception("Optimization failed")
        _update_optimization_status(opik_client, optimization_id, "error")
        opik_client.end()
        sys.exit(1)

    # 5. Print a human-readable summary to the terminal.
    print("\n" + "=" * 60)
    print("OPTIMIZATION COMPLETE")
    print("=" * 60)
    print(f" Optimization ID : {optimization_id}")
    print(f" Final score : {result.score:.4f}")
    print(f" Initial score : {result.initial_score}")
    print(f" Total trials : {len(result.all_trials)}")

    if result.best_trial:
        print(f"\n Best trial:")
        print(f" Score : {result.best_trial.score:.4f}")
        print(f" Experiment : {result.best_trial.experiment_name}")
        print(f" Prompt :")
        for msg in result.best_trial.prompt_messages:
            # Truncate long prompt contents to keep the summary readable.
            print(f" [{msg['role']}] {msg['content'][:80]}...")

    print(f"\n View in UI: {OPIK_URL or 'https://www.comet.com/opik'}")
    print("=" * 60)

    opik_client.end()
215+
216+
217+
if __name__ == "__main__":
    # Fail fast: an OpenAI model without credentials would only fail mid-run.
    targets_openai = "gpt" in MODEL.lower()
    if targets_openai and not os.environ.get("OPENAI_API_KEY"):
        print("ERROR: OPENAI_API_KEY not set. Export it or set OPIK_TEST_MODEL to another provider.")
        sys.exit(1)
    main()
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
from opik_optimizer_framework.orchestrator import run_optimization
2+
from opik_optimizer_framework.types import (
3+
OptimizationContext,
4+
OptimizationResult,
5+
OptimizationState,
6+
TrialResult,
7+
)
8+
9+
__all__ = [
10+
"run_optimization",
11+
"OptimizationContext",
12+
"OptimizationResult",
13+
"OptimizationState",
14+
"TrialResult",
15+
]
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
from __future__ import annotations
2+
3+
import uuid
4+
from dataclasses import asdict
5+
6+
from opik_optimizer_framework.types import Candidate, CandidateConfig
7+
from opik_optimizer_framework.util.hashing import canonical_config_hash
8+
9+
10+
def materialize_candidate(
    config: CandidateConfig,
    step_index: int,
    parent_candidate_ids: list[str] | None = None,
) -> Candidate:
    """Turn a CandidateConfig into a tracked Candidate.

    Stub materializer: assigns a freshly generated UUID as candidate_id and a
    deterministic hash of the canonicalized config. mask_id creation is not
    implemented here.
    """
    digest = canonical_config_hash(asdict(config))
    return Candidate(
        candidate_id=str(uuid.uuid4()),
        config=config,
        config_hash=digest,
        step_index=step_index,
        parent_candidate_ids=parent_candidate_ids or [],
    )
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
from __future__ import annotations
2+
3+
import logging
4+
from dataclasses import asdict
5+
6+
from opik_optimizer_framework.types import CandidateConfig, OptimizationState
7+
from opik_optimizer_framework.util.hashing import canonical_config_hash
8+
9+
logger = logging.getLogger(__name__)
10+
11+
12+
def validate_candidate(
    config: CandidateConfig,
    state: OptimizationState,
) -> tuple[bool, str | None]:
    """Check a candidate config for shape problems and duplicates.

    Returns (True, None) when the candidate is acceptable, otherwise
    (False, reason) with a machine-readable rejection reason.

    Two checks are applied:
      1. shape: at least one message, each with a non-empty role and content
      2. dedup: the canonical config hash must not be in state.seen_hashes
    """
    messages = config.prompt_messages
    if not messages:
        return False, "empty_messages"

    for idx, message in enumerate(messages):
        if "role" not in message or "content" not in message:
            return False, f"message_{idx}_missing_role_or_content"
        if not message["role"] or not message["content"]:
            return False, f"message_{idx}_empty_role_or_content"

    # Reject exact re-runs of a configuration already seen in this optimization.
    if canonical_config_hash(asdict(config)) in state.seen_hashes:
        return False, "duplicate_config_hash"

    return True, None

0 commit comments

Comments
 (0)