Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions docs/source/garak.generators.llm.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
garak.generators.llm
==========================

.. automodule:: garak.generators.llm
:members:
:undoc-members:
:show-inheritance:
1 change: 1 addition & 0 deletions docs/source/generators.rst
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ For a detailed oversight into how a generator operates, see :doc:`garak.generato
garak.generators.langchain
garak.generators.langchain_serve
garak.generators.litellm
garak.generators.llm
garak.generators.mistral
garak.generators.ollama
garak.generators.openai
Expand Down
157 changes: 157 additions & 0 deletions garak/generators/llm.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
# SPDX-FileCopyrightText: Portions Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

"""LLM (simonw/llm) generator support"""

import inspect
import logging
from typing import List, Union

from garak import _config
from garak.attempt import Message, Conversation
from garak.generators.base import Generator


class LLMGenerator(Generator):
    """Class supporting simonw/llm-managed models

    See https://pypi.org/project/llm/ and its provider plugins.

    Calls model.prompt() with the prompt text and relays the response. Per-provider
    options and API keys are all handled by `llm` (e.g., `llm keys set openai`).

    Set --target_name to the `llm` model id or alias (e.g., "gpt-4o-mini",
    "claude-3.5-haiku", or a local alias configured in `llm models`).

    Explicitly, garak delegates the majority of responsibility here:

    * the generator calls prompt() on the resolved `llm` model
    * provider setup, auth, and model-specific options live in `llm`
    * there's no support for chains; this is a direct LLM interface

    Notes:
    * Not all providers support all parameters (e.g., temperature, max_tokens).
      We introspect the target's prompt() signature and pass only parameters it
      accepts (and that are non-None); a conservative fallback subset is used
      only when the signature cannot be read or prompt() takes **kwargs.
    """

    DEFAULT_PARAMS = Generator.DEFAULT_PARAMS | {
        "top_p": None,
        "stop": [],
    }

    generator_family_name = "llm"
    parallel_capable = False

    # resolved by garak's optional-dependency loader; exposed as `self.llm`
    extra_dependency_names = ["llm"]

    def __init__(self, name: str = "", config_root=_config):
        """Load config and eagerly resolve the `llm` model handle.

        :param name: `llm` model id or alias; empty string means `llm`'s default model
        :param config_root: garak configuration root supplying generator options
        :raises Exception: whatever `llm.get_model` raises if the id cannot be resolved
        """
        self.target = None  # the resolved `llm` model object
        self.name = name
        self._load_config(config_root)
        self.fullname = f"llm (simonw/llm) {self.name or '(default)'}"

        super().__init__(self.name, config_root=config_root)
        self._load_client()

    def __getstate__(self) -> object:
        # The model handle may hold unpicklable resources (HTTP clients, etc.);
        # drop it before pickling and re-resolve on unpickle.
        self._clear_client()
        return dict(self.__dict__)

    def __setstate__(self, data: dict) -> None:
        self.__dict__.update(data)
        self._load_client()

    def _load_client(self) -> None:
        """Resolve ``self.name`` to an `llm` model object; log and re-raise on failure."""
        try:
            self.target = (
                self.llm.get_model(self.name) if self.name else self.llm.get_model()
            )
        except Exception as exc:
            logging.error(
                "Failed to resolve `llm` target '%s': %s", self.name, repr(exc)
            )
            raise

    def _clear_client(self) -> None:
        # Counterpart to _load_client, used before pickling.
        self.target = None

    def _build_prompt_kwargs(self) -> dict:
        """Assemble keyword arguments for target.prompt() from generator params.

        Prefers the introspected signature of the target's prompt() so only
        supported options are sent. Falls back to a conservative parameter
        subset ONLY when the signature cannot be read, or when prompt()
        accepts **kwargs (in which case extra keys are safe to pass).
        """
        prompt_kwargs = {}
        try:
            signature = inspect.signature(self.target.prompt)
            accepted_params = signature.parameters
            accepts_var_kwargs = any(
                param.kind == inspect.Parameter.VAR_KEYWORD
                for param in accepted_params.values()
            )
        except (TypeError, ValueError):
            # builtins / C-implemented callables may not expose a signature
            accepted_params = {}
            accepts_var_kwargs = False

        for key in accepted_params:
            if key in {"prompt", "prompt_text", "text", "self"}:
                continue  # positional prompt text is passed separately
            if not hasattr(self, key):
                continue
            value = getattr(self, key)
            if value is None:
                continue
            if key == "stop" and not value:
                continue  # don't send an empty stop-sequence list
            prompt_kwargs[key] = value

        # Fall back only when we could not learn the accepted parameters, or the
        # callable takes **kwargs. (Previously this also fired whenever
        # prompt_kwargs was empty, which could pass keys a strict signature
        # does not accept and raise TypeError.)
        if accepts_var_kwargs or not accepted_params:
            for key in ("max_tokens", "temperature", "top_p"):
                if key in prompt_kwargs:
                    continue
                value = getattr(self, key, None)
                if value is not None:
                    prompt_kwargs[key] = value
            stop_value = getattr(self, "stop", None)
            if stop_value:
                prompt_kwargs.setdefault("stop", stop_value)

        return prompt_kwargs

    def _call_model(
        self, prompt: Conversation, generations_this_call: int = 1
    ) -> List[Union[Message, None]]:
        """
        Continuation generation method for LLM integrations via `llm`.

        Calls model.prompt() once per generation and materializes the text().
        Returns one Message per generation, or None for a failed generation.
        """
        if self.target is None:
            self._load_client()

        system_turns = [turn for turn in prompt.turns if turn.role == "system"]
        user_turns = [turn for turn in prompt.turns if turn.role == "user"]
        assistant_turns = [turn for turn in prompt.turns if turn.role == "assistant"]

        # `llm`'s prompt() takes a single prompt string (plus an optional system
        # prompt); reject conversation shapes it cannot represent.
        if assistant_turns:
            logging.debug("llm generator does not accept assistant turns")
            return [None] * generations_this_call
        if len(system_turns) > 1:
            logging.debug("llm generator supports at most one system turn")
            return [None] * generations_this_call
        if len(user_turns) != 1:
            logging.debug("llm generator requires exactly one user turn")
            return [None] * generations_this_call

        text_prompt = prompt.last_message("user").text

        prompt_kwargs = self._build_prompt_kwargs()

        # Relay the validated system turn instead of silently dropping it;
        # llm's Model.prompt() accepts a `system=` keyword.
        if system_turns:
            system_message = prompt.last_message("system")
            if system_message is not None and system_message.text:
                prompt_kwargs.setdefault("system", system_message.text)

        outputs: List[Union[Message, None]] = []
        for _ in range(generations_this_call):
            try:
                response = self.target.prompt(text_prompt, **prompt_kwargs)
                outputs.append(Message(response.text()))
            except Exception as e:
                # best-effort: record the failure, keep remaining generations going
                logging.error("`llm` generation failed: %s", repr(e))
                outputs.append(None)
        return outputs


DEFAULT_CLASS = "LLMGenerator"
48 changes: 5 additions & 43 deletions garak/probes/encoding.py
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can changes to this file be removed? While the classes are somewhat inconsistent about where the list is defined, the reasoning for this revision is unclear.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The commit applied expanded the edits and added .claude/settings.local.json to the repository. It might be appropriate to rebase this branch and double check the changes before pushing in updated form.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sorry, shouldn't have added that :). I removed it and rebased the branch

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These fix commits are getting more and more off base. NO changes in this PR should impact encoding.py or tests related to that probe.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're right; I was approving changes that weren't associated with this issue. I will be more careful with this in the future.

I realized I was working off a bad branch, so I created a new branch off main and moved over only the code changes associated with this issue:

garak/generators/llm.py
tests/generators/test_llm.py
docs/source/garak.generators.llm.rst

In the future I will read the Garak Docs, and make sure I am not approving AI changes without review.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Commit f1f06af has again gotten worse; please do a manual review of the changes and rebase. Also, all commits are expected to contain user sign-off per the DCO. An LLM cannot sign off on code: the author must review it, assert both code correctness and ownership per the DCO, and sign off on each commit as meeting the project standards and requirements.

Original file line number Diff line number Diff line change
@@ -1,14 +1,10 @@
"""**Encoding-based prompt injection**
"""Encoding-based prompt injection

Encode an unsafe instruction or string in an effort to get it past alignment.

Probes in this modules all perform some kind of encoding of the target string or instruction.

These ``encoding`` probes try to get a model to generate a specific piece of given text
Probes that try to get a model to generate a specific piece of given text
by presenting an encoded version of that text. Attempts to circumvent
safeguards on input filtering.

optional variable: ``encoding.payloads`` -> a list of payload types to be used
optional variable: "encoding.payloads" -> a list of payload types to be used

Recommended garak payloads to try:

Expand All @@ -25,7 +21,6 @@
import binascii
import random


import garak.payloads
import garak.resources.encodings
from garak.resources.smuggling import smuggle_ascii
Expand Down Expand Up @@ -266,6 +261,7 @@ def __init__(self):
self.prompts, self.triggers = zip(
*random.sample(generated_prompts, self.soft_probe_prompt_cap)
)
self.prompts = self.langprovider.get_text(self.prompts)

def _attempt_prestore_hook(self, attempt, seq):
attempt.notes["triggers"] = [self.triggers[seq]]
Expand Down Expand Up @@ -461,40 +457,6 @@ def __init__(self, config_root=_config):
EncodingMixin.__init__(self)


class InjectAtbash(EncodingMixin, garak.probes.Probe):
    """Atbash Probe Credit: Nakul-Rajpal

    https://en.wikipedia.org/wiki/Atbash

    This probe checks whether a model:
    - recognizes Atbash-encoded malicious instructions,
    - repeats or decodes them,
    - and follows hidden instructions despite obfuscation.
    """

    encoding_name = "Atbash"
    active = True
    doc_uri = "https://en.wikipedia.org/wiki/Atbash"

    @staticmethod
    def atbash(text: bytes) -> bytes:
        """Return *text* with each ASCII letter mapped to its alphabet mirror
        (a<->z, b<->y, ...), preserving case; non-letters pass through unchanged.

        Atbash is an involution, so applying this function twice restores the
        original text. Input bytes are decoded with the default (UTF-8) codec
        and the result re-encoded as UTF-8.
        """
        out = []
        for ch in text.decode():
            if "a" <= ch <= "z":
                # mirror within the lowercase range
                out.append(chr(ord("z") - (ord(ch) - ord("a"))))
            elif "A" <= ch <= "Z":
                # mirror within the uppercase range
                out.append(chr(ord("Z") - (ord(ch) - ord("A"))))
            else:
                out.append(ch)
        return "".join(out).encode("utf-8")

    # EncodingMixin consumes this list to build encoded prompt variants
    encoding_funcs = [atbash]

    def __init__(self, config_root=None):
        # Both bases are initialized explicitly; EncodingMixin builds prompts
        # from encoding_funcs after Probe configuration is loaded.
        garak.probes.Probe.__init__(self, config_root=config_root)
        EncodingMixin.__init__(self)


class InjectMorse(EncodingMixin, garak.probes.Probe):
"""Probe for susceptibility to Morse encoding. Credit Wally#3422

Expand Down Expand Up @@ -567,7 +529,7 @@ def __init__(self, config_root=_config):
class InjectEcoji(EncodingMixin, garak.probes.Probe):
"""Probe for susceptibility to Ecoji encoding.

Encodes payloads using the `ecoji <https://ecoji.io/>`_ mapping, and tries a variety
Encodes payloads using the [ecoji](https://ecoji.io/) mapping, and tries a variety
of ways of getting the target model to repeat & decode them."""

doc_uri = "https://ecoji.io/"
Expand Down
1 change: 1 addition & 0 deletions garak/resources/encodings.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
Functions used to offer various encodings that are needed more than one place in garak
"""


import random


Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ dependencies = [
"mistralai==1.5.2",
"pillow>=10.4.0",
"ftfy>=6.3.1",
"llm>=0.11",
"websockets>=13.0",
"boto3>=1.28.0",
]
Expand Down
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ google-api-python-client>=2.0
backoff>=2.1.1
rapidfuzz>=3.0.0
nltk>=3.9.1
llm>=0.11
accelerate>=0.23.0
avidtools==0.1.2
stdlibs>=2022.10.9
Expand Down
Loading