
Commit da73b0d

Logging debug - WIP
1 parent 90ff956 commit da73b0d


7 files changed: +826 −682 lines


truss-chains/examples/streaming/streaming_chain.py

Lines changed: 39 additions & 20 deletions
@@ -1,7 +1,9 @@
 import asyncio
+import logging
 import time
 from typing import AsyncIterator
 
+import fastapi
 import pydantic
 
 import truss_chains as chains
@@ -38,17 +40,30 @@ class ConsumerOutput(pydantic.BaseModel):
 class Generator(chains.ChainletBase):
     """Example that streams fully structured pydantic items with header and footer."""
 
-    async def run_remote(self, cause_error: bool) -> AsyncIterator[bytes]:
-        print("Entering Generator")
+    async def run_remote(
+        self, cause_pre_stream_error: bool, cause_mid_stream_error: bool
+    ) -> AsyncIterator[bytes]:
+        logging.info("Entering Generator")
+        if cause_pre_stream_error:
+            logging.info("Raise Pre Stream")
+            raise fastapi.HTTPException(
+                status_code=fastapi.status.HTTP_400_BAD_REQUEST,
+                detail="Error pre stream.",
+            )
+        logging.info("Starting streamer.")
+
         streamer = streaming.stream_writer(STREAM_TYPES)
         header = Header(time=time.time(), msg="Start.")
         yield streamer.yield_header(header)
         for i in range(1, 5):
             data = MyDataChunk(words=[chr(x + 70) * x for x in range(1, i + 1)])
-            print("Yield")
+            logging.info("Yield")
             yield streamer.yield_item(data)
-            if cause_error and i > 2:
-                raise RuntimeError("Test Error")
+            if cause_mid_stream_error and i > 2:
+                raise fastapi.HTTPException(
+                    status_code=fastapi.status.HTTP_501_NOT_IMPLEMENTED,
+                    detail="Error mid stream",
+                )
             await asyncio.sleep(0.05)
 
         end_time = time.time()
@@ -74,16 +89,20 @@ class Consumer(chains.ChainletBase):
 
     def __init__(
         self,
-        generator=chains.depends(Generator),
-        string_generator=chains.depends(StringGenerator),
+        # generator=chains.depends(Generator),
+        # string_generator=chains.depends(StringGenerator),
     ):
-        self._generator = generator
-        self._string_generator = string_generator
+        # self._generator = generator
+        # self._string_generator = string_generator
+        pass
 
-    async def run_remote(self, cause_error: bool) -> ConsumerOutput:
+    async def run_remote(
+        self, cause_pre_stream_error: bool, cause_mid_stream_error: bool
+    ) -> ConsumerOutput:
         print("Entering Consumer")
         reader = streaming.stream_reader(
-            STREAM_TYPES, self._generator.run_remote(cause_error)
+            STREAM_TYPES,
+            self._generator.run_remote(cause_pre_stream_error, cause_mid_stream_error),
         )
         print("Consuming...")
         header = await reader.read_header()
@@ -92,15 +111,15 @@ async def run_remote(self, cause_error: bool) -> ConsumerOutput:
             print(f"Read: {data}")
             chunks.append(data)
 
-        footer = await reader.read_footer()
-        strings = []
-        async for part in self._string_generator.run_remote():
-            strings.append(part)
-
-        print("Exiting Consumer")
-        return ConsumerOutput(
-            header=header, chunks=chunks, footer=footer, strings="".join(strings)
-        )
+        # footer = await reader.read_footer()
+        # strings = []
+        # async for part in self._string_generator.run_remote():
+        #     strings.append(part)
+        #
+        # print("Exiting Consumer")
+        # return ConsumerOutput(
+        #     header=header, chunks=chunks, footer=footer, strings="".join(strings)
+        # )
 
 
 if __name__ == "__main__":
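
The split into `cause_pre_stream_error` and `cause_mid_stream_error` matters because only an error raised before the first `yield` can still be mapped to an HTTP status; once streaming has begun, an exception can only truncate the stream. A minimal, framework-free sketch of that distinction (illustrative only, not part of the commit):

import asyncio
from typing import AsyncIterator


async def generate(pre_error: bool, mid_error: bool) -> AsyncIterator[int]:
    if pre_error:
        # Raised before the first `yield`: no bytes were sent yet, so a
        # server could still turn this into an HTTP error response.
        raise ValueError("pre-stream error")
    for i in range(4):
        yield i
        if mid_error and i > 1:
            # Raised after yields began: the status line is already sent.
            raise ValueError("mid-stream error")


async def main() -> None:
    gen = generate(pre_error=True, mid_error=False)
    try:
        # Creating the generator runs nothing; the first __anext__ does,
        # so pre-stream errors surface here, before any item is produced.
        await gen.__anext__()
    except ValueError as e:
        print(f"caught before any item was produced: {e}")


asyncio.run(main())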

truss-chains/tests/test_e2e.py

Lines changed: 18 additions & 12 deletions
@@ -163,18 +163,24 @@ def test_streaming_chain():
     assert service is not None
     time.sleep(1.0)  # Wait for models to be ready.
 
-    response = service.run_remote({"cause_error": False})
-    assert response.status_code == 200
-    print(response.json())
-    result = response.json()
-    print(result)
-    assert result["header"]["msg"] == "Start."
-    assert result["chunks"][0]["words"] == ["G"]
-    assert result["chunks"][1]["words"] == ["G", "HH"]
-    assert result["chunks"][2]["words"] == ["G", "HH", "III"]
-    assert result["chunks"][3]["words"] == ["G", "HH", "III", "JJJJ"]
-    assert result["footer"]["duration_sec"] > 0
-    assert result["strings"] == "First second last."
+    response = service.run_remote(
+        {"cause_pre_stream_error": "hell", "cause_mid_stream_error": False}
+    )
+    print(response.status_code)
+    print(response.content)
+    assert False
+
+    # assert response.status_code == 200
+    # print(response.json())
+    # result = response.json()
+    # print(result)
+    # assert result["header"]["msg"] == "Start."
+    # assert result["chunks"][0]["words"] == ["G"]
+    # assert result["chunks"][1]["words"] == ["G", "HH"]
+    # assert result["chunks"][2]["words"] == ["G", "HH", "III"]
+    # assert result["chunks"][3]["words"] == ["G", "HH", "III", "JJJJ"]
+    # assert result["footer"]["duration_sec"] > 0
+    # assert result["strings"] == "First second last."
 
     # TODO: build error handling for stream reader.
     # response = service.run_remote({"cause_error": True})

truss/templates/server/model_wrapper.py

Lines changed: 31 additions & 11 deletions
@@ -656,6 +656,7 @@ async def preprocess(
         )
         return await self._execute_user_model_fn(inputs, request, descriptor)
 
+    # TODO: can we eliminate this bloat layer?
     async def _predict(
         self, inputs: Any, request: starlette.requests.Request
     ) -> Union[OutputType, Any]:
@@ -696,6 +697,11 @@ async def _write_response_to_queue(
                 f"Exception while generating streamed response: {str(e)}",
                 exc_info=errors.filter_traceback(self.model_file_name),
             )
+            # Since this runs in a task, we *do not* raise the exception, just
+            # log the error, close the queue and finish the task.
+            # It's not possible to signal an error to the client (e.g. via HTTP
+            # status) after streaming has begun unless introducing a schema for
+            # error messages on the stream itself - but here we are unopinionated.
         finally:
             await queue.put(SENTINEL)
 
@@ -711,14 +717,27 @@ async def _stream_with_background_task(
         streaming_read_timeout = self._config.get("runtime", {}).get(
             "streaming_read_timeout", STREAMING_RESPONSE_QUEUE_READ_TIMEOUT_SECS
         )
-        async_generator = _force_async_generator(generator)
         # To ensure that a partial read from a client does not keep the semaphore
         # claimed, we write all the data from the stream to the queue as it is produced,
         # irrespective of how fast it is consumed.
        # We then return a new generator that reads from the queue, and then
         # exits the semaphore block.
         response_queue: asyncio.Queue = asyncio.Queue()
 
+        # In order to catch errors before the first `yield` (e.g. user implemented
+        # input validation), we get the first chunk here and raise the error if needed.
+        with tracing.section_as_event(span, "await_first_element"):
+            try:
+                async_generator = _force_async_generator(generator)
+                first_chunk = await async_generator.__anext__()
+                await response_queue.put(first_chunk)
+            except StopAsyncIteration:
+                cleanup_fn()
+                return (chunk async for chunk in [])  # Empty dummy generator.
+            except Exception as e:
+                cleanup_fn()
+                # print("CALL STACK:\n" + "".join(traceback.format_stack()))
+                raise e
         # `write_response_to_queue` keeps running the background until completion.
         gen_task = asyncio.create_task(
             self._write_response_to_queue(response_queue, async_generator, span)
@@ -727,8 +746,6 @@ async def _stream_with_background_task(
         gen_task.add_done_callback(lambda _: cleanup_fn())
 
         # The gap between responses in a stream must be < streaming_read_timeout
-        # TODO: this whole buffering might be superfluous and sufficiently done by
-        # by the FastAPI server already. See `test_limit_concurrency_with_sse`.
         async def _buffered_response_generator() -> AsyncGenerator[bytes, None]:
             # `span` is tied to the "producer" `gen_task` which might complete before
             # "consume" part here finishes, therefore a dedicated span is required.
@@ -854,20 +871,23 @@ async def predict(
             # exactly handle that case we would need to apply `detach_context`
             # around each `next`-invocation that consumes the generator, which is
             # prohibitive.
+            # TODO: predict has exception interception via `_execute_user_model_fn`,
+            # but all the other parts of the flow don't have that...
+            # why is the stack trace above here missing?
             predict_result = await self._predict(preprocess_result, request)
 
             if inspect.isgenerator(predict_result) or inspect.isasyncgen(
                 predict_result
             ):
                 if self.model_descriptor.postprocess:
-                    with errors.intercept_exceptions(
-                        self._logger, self.model_file_name
-                    ):
-                        raise errors.ModelDefinitionError(
-                            "If the predict function returns a generator (streaming), "
-                            "you cannot use postprocessing. Include all processing in "
-                            "the predict method."
-                        )
+                    # with errors.intercept_exceptions(
+                    #     self._logger, self.model_file_name
+                    # ):
+                    raise errors.ModelDefinitionError(
+                        "If the predict function returns a generator (streaming), "
+                        "you cannot use postprocessing. Include all processing in "
+                        "the predict method."
+                    )
 
             return await self._handle_generator_response(
                 request,
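
The new `await_first_element` block implements a general pattern: pull the first chunk eagerly so that pre-stream errors still propagate to the caller (and can become an HTTP status), then hand the remaining generator to a background task that drains it into a queue regardless of how fast the client reads. A simplified standalone sketch of that pattern, using local stand-ins (`SENTINEL`, plain `print` logging, no tracing or cleanup callbacks) rather than the server's actual helpers:

import asyncio
from typing import AsyncGenerator, AsyncIterator

SENTINEL = object()  # Marks end-of-stream on the queue.


async def _drain_to_queue(gen: AsyncIterator[bytes], queue: asyncio.Queue) -> None:
    try:
        async for chunk in gen:
            await queue.put(chunk)
    except Exception as e:
        # Runs inside a task after streaming began: only log, never raise -
        # the HTTP status can no longer be changed for the client.
        print(f"error while streaming: {e}")
    finally:
        await queue.put(SENTINEL)


async def stream_with_buffering(
    gen: AsyncIterator[bytes],
) -> AsyncGenerator[bytes, None]:
    queue: asyncio.Queue = asyncio.Queue()
    # Pre-stream errors (e.g. input validation before the first `yield`)
    # surface right here and propagate to the caller. The real code also
    # catches StopAsyncIteration to return an empty stream gracefully.
    first_chunk = await gen.__anext__()
    await queue.put(first_chunk)
    # The producer task keeps filling the queue even if the consumer is slow;
    # the real code additionally attaches a cleanup callback to this task.
    gen_task = asyncio.create_task(_drain_to_queue(gen, queue))

    async def buffered() -> AsyncGenerator[bytes, None]:
        while (chunk := await queue.get()) is not SENTINEL:
            yield chunk
        await gen_task  # Producer already finished; this just reaps the task.

    return buffered()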

truss/templates/server/truss_server.py

Lines changed: 0 additions & 3 deletions
@@ -179,9 +179,6 @@ async def _execute_request(
         request: Request,
         body_raw: bytes,
     ) -> Response:
-        """
-        Executes a predictive endpoint
-        """
         self.check_healthy()
         trace_ctx = otel_propagate.extract(request.headers) or None
         # This is the top-level span in the truss-server, so we set the context here.

truss/tests/helpers.py

Lines changed: 50 additions & 1 deletion
@@ -1,7 +1,15 @@
+import contextlib
+import json
+import tempfile
+import textwrap
 from pathlib import Path
+from typing import Iterator, Optional
 
+from truss.tests.test_testing_utilities_for_other_tests import ensure_kill_all
+from truss.truss_handle.truss_handle import TrussHandle
 
-def create_truss(truss_dir: Path, config_contents: str, model_contents: str):
+
+def _create_truss(truss_dir: Path, config_contents: str, model_contents: str):
     truss_dir.mkdir(exist_ok=True)  # Ensure the 'truss' directory exists
     truss_model_dir = truss_dir / "model"
     truss_model_dir.mkdir(parents=True, exist_ok=True)
@@ -12,3 +20,44 @@ def create_truss(truss_dir: Path, config_contents: str, model_contents: str):
         file.write(config_contents)
     with open(model_file, "w", encoding="utf-8") as file:
         file.write(model_contents)
+
+
+@contextlib.contextmanager
+def temp_truss(model_src: str, config_src: str = "") -> Iterator[TrussHandle]:
+    with ensure_kill_all(), tempfile.TemporaryDirectory(dir=".") as tmp_work_dir:
+        truss_dir = Path(tmp_work_dir, "truss")
+        _create_truss(truss_dir, config_src, textwrap.dedent(model_src))
+        yield TrussHandle(truss_dir)
+
+
+DEFAULT_LOG_ERROR = "Internal Server Error"
+
+
+def _log_contains_line(
+    line: dict, message: str, level: str, error: Optional[str] = None
+):
+    return (
+        line["levelname"] == level
+        and message in line["message"]
+        and (error is None or error in line["exc_info"])
+    )
+
+
+def assert_logs_contain_error(
+    logs: str, error: Optional[str], message=DEFAULT_LOG_ERROR
+):
+    loglines = [json.loads(line) for line in logs.splitlines()]
+    assert any(
+        _log_contains_line(line, message, "ERROR", error) for line in loglines
+    ), (
+        f"Did not find expected error in logs.\nExpected error: {error}\n"
+        f"Expected message: {message}\nActual logs:\n{loglines}"
+    )
+
+
+def assert_logs_contain(logs: str, message: str, level: str = "INFO"):
+    loglines = [json.loads(line) for line in logs.splitlines()]
+    assert any(_log_contains_line(line, message, level) for line in loglines), (
+        f"Did not find expected logs.\n"
+        f"Expected message: {message}\nActual logs:\n{loglines}"
+    )
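
These log assertions expect JSON-line records carrying `levelname`, `message`, and (for errors) `exc_info` fields. A hypothetical usage sketch against a synthetic log string (the record contents here are invented for illustration, matching only the field names the helpers read):

import json

from truss.tests.helpers import assert_logs_contain, assert_logs_contain_error

records = [
    {"levelname": "INFO", "message": "Starting server.", "exc_info": ""},
    {
        "levelname": "ERROR",
        "message": "Internal Server Error",
        "exc_info": "Traceback ...\nValueError: Test error",
    },
]
# The helpers parse one JSON object per line.
logs = "\n".join(json.dumps(record) for record in records)

assert_logs_contain(logs, "Starting server.")
assert_logs_contain_error(logs, error="ValueError: Test error")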
