first commit

svpino · svpino · commit 7ed007c26100 · 2024-06-18T12:09:12.000+02:00
diff --git a/.env b/.env
@@ -0,0 +1,5 @@
+LIVEKIT_URL=wss://assistant-fo94nojo.livekit.cloud
+LIVEKIT_API_KEY=APIKxvX5Ab5zkDn
+LIVEKIT_API_SECRET=BE9uqAxeTEyKTCrX04lHre6Y9lFKCge3N1WvDGfabeSA
+DEEPGRAM_API_KEY=f63176125b952fee10ecf06e0b2dc06b48672e6a
+OPENAI_API_KEY=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
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1 @@
+.venv
diff --git a/README.md b/README.md
@@ -0,0 +1,29 @@
+# LiveKit Assistant
+
+First, create a virtual environment, update pip, and install the required packages:
+
+```
+$ python3 -m venv .venv
+$ source .venv/bin/activate
+$ pip install -U pip
+$ pip install -r requirements.txt
+```
+
+You need to set up the following environment variables:
+
+```
+LIVEKIT_URL=...
+LIVEKIT_API_KEY=...
+LIVEKIT_API_SECRET=...
+DEEPGRAM_API_KEY=...
+OPENAI_API_KEY=...
+```
+
+Then, run the assistant:
+
+```
+$ python3 assistant.py download-files
+$ python3 assistant.py start
+```
+
+Finally, you can load the [hosted playground](https://agents-playground.livekit.io/).
diff --git a/assistant.py b/assistant.py
@@ -0,0 +1,141 @@
+import asyncio
+from typing import Annotated
+
+from livekit import agents, rtc
+from livekit.agents import JobContext, JobRequest, WorkerOptions, cli, tokenize, tts
+from livekit.agents.llm import (
+    ChatContext,
+    ChatImage,
+    ChatMessage,
+    ChatRole,
+)
+from livekit.agents.voice_assistant import AssistantContext, VoiceAssistant
+from livekit.plugins import deepgram, openai, silero
+
+
+class AssistantFunction(agents.llm.FunctionContext):
+    """This class is used to define functions that will be called by the assistant."""
+
+    @agents.llm.ai_callable(
+        desc=(
+            "Called when asked to evaluate something that would require vision capabilities,"
+            "for example, an image, video, or the webcam feed."
+        )
+    )
+    async def image(
+        self,
+        user_msg: Annotated[
+            str,
+            agents.llm.TypeInfo(desc="The user message that triggered this function"),
+        ],
+    ):
+        print(f"Message triggering vision capabilities: {user_msg}")
+        context = AssistantContext.get_current()
+        context.store_metadata("user_msg", user_msg)
+
+
+async def get_video_track(room: rtc.Room):
+    """Get the first video track from the room. We'll use this track to process images."""
+
+    video_track = asyncio.Future[rtc.RemoteVideoTrack]()
+
+    for _, participant in room.participants.items():
+        for _, track_publication in participant.tracks.items():
+            if track_publication.track is not None and isinstance(
+                track_publication.track, rtc.RemoteVideoTrack
+            ):
+                video_track.set_result(track_publication.track)
+                print(f"Using video track {track_publication.track.sid}")
+                break
+
+    return await video_track
+
+
+async def entrypoint(ctx: JobContext):
+    print(f"Room name: {ctx.room.name}")
+
+    chat_context = ChatContext(
+        messages=[
+            ChatMessage(
+                role=ChatRole.SYSTEM,
+                text=(
+                    "Your name is Alloy. You are a funny, witty bot. Your interface with users will be voice and vision."
+                    "Respond with short and concise answers. Avoid using unpronouncable punctuation or emojis."
+                ),
+            )
+        ]
+    )
+
+    gpt = openai.LLM(model="gpt-4o")
+
+    # Since OpenAI does not support streaming TTS, we'll use it with a StreamAdapter
+    # to make it compatible with the VoiceAssistant
+    openai_tts = tts.StreamAdapter(
+        tts=openai.TTS(voice="alloy"),
+        sentence_tokenizer=tokenize.basic.SentenceTokenizer(),
+    )
+
+    latest_image: rtc.VideoFrame | None = None
+
+    assistant = VoiceAssistant(
+        vad=silero.VAD(),  # We'll use Silero's Voice Activity Detector (VAD)
+        stt=deepgram.STT(),  # We'll use Deepgram's Speech To Text (STT)
+        llm=gpt,
+        tts=openai_tts,  # We'll use OpenAI's Text To Speech (TTS)
+        fnc_ctx=AssistantFunction(),
+        chat_ctx=chat_context,
+    )
+
+    chat = rtc.ChatManager(ctx.room)
+
+    async def _answer(text: str, use_image: bool = False):
+        """
+        Answer the user's message with the given text and optionally the latest
+        image captured from the video track.
+        """
+        args = {}
+        if use_image and latest_image:
+            args["images"] = [ChatImage(image=latest_image)]
+
+        chat_context.messages.append(ChatMessage(role=ChatRole.USER, text=text, **args))
+
+        stream = await gpt.chat(chat_context)
+        await assistant.say(stream, allow_interruptions=True)
+
+        await assistant.say(stream)
+
+    @chat.on("message_received")
+    def on_message_received(msg: rtc.ChatMessage):
+        """This event triggers whenever we get a new message from the user."""
+
+        if msg.message:
+            asyncio.create_task(_answer(msg.message, use_image=False))
+
+    @assistant.on("function_calls_finished")
+    def on_function_calls_finished(ctx: AssistantContext):
+        """This event triggers when an assistant's function call completes."""
+
+        user_msg = ctx.get_metadata("user_msg")
+        if user_msg:
+            asyncio.create_task(_answer(user_msg, use_image=True))
+
+    assistant.start(ctx.room)
+
+    await asyncio.sleep(1)
+    await assistant.say("Hi there! How can I help?", allow_interruptions=True)
+
+    while ctx.room.connection_state == rtc.ConnectionState.CONN_CONNECTED:
+        video_track = await get_video_track(ctx.room)
+
+        async for event in rtc.VideoStream(video_track):
+            # We'll continually grab the latest image from the video track
+            # and store it in a variable.
+            latest_image = event.frame
+
+
+async def request_fnc(req: JobRequest) -> None:  # Handler passed to WorkerOptions below.
+    await req.accept(entrypoint)  # Accept the job request, passing `entrypoint` to it.
+
+
+if __name__ == "__main__":  # Only run the CLI when this file is executed as a script.
+    cli.run_app(WorkerOptions(request_fnc))  # Hand the CLI a WorkerOptions built around request_fnc.
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,6 @@
+python-dotenv
+livekit-agents
+livekit-plugins-deepgram
+livekit-plugins-openai
+livekit-plugins-elevenlabs
+livekit-plugins-silero
diff --git a/sample.py b/sample.py
@@ -0,0 +1,159 @@
+import asyncio
+import copy
+import logging
+from collections import deque
+from typing import Annotated, List
+
+from livekit import agents, rtc
+from livekit.agents import JobContext, JobRequest, WorkerOptions, cli, tokenize, tts
+from livekit.agents.llm import (
+    ChatContext,
+    ChatMessage,
+    ChatRole,
+)
+from livekit.agents.voice_assistant import AssistantContext, VoiceAssistant
+from livekit.plugins import deepgram, openai, silero
+
+MAX_IMAGES = 3  # Threshold on img_msg_queue length used by respond_to_image to decide when to clear the oldest message's images.
+NO_IMAGE_MESSAGE_GENERIC = (  # Spoken by respond_to_image when latest_image is still unset.
+    "I'm sorry, I don't have an image to process. Are you publishing your video?"
+)
+
+
+class AssistantFnc(agents.llm.FunctionContext):
+    @agents.llm.ai_callable(
+        desc="Called when asked to evaluate something that would require vision capabilities."
+    )
+    async def image(
+        self,
+        user_msg: Annotated[
+            str,
+            agents.llm.TypeInfo(desc="The user message that triggered this function"),
+        ],
+    ):
+        ctx = AssistantContext.get_current()
+        ctx.store_metadata("user_msg", user_msg)
+
+
+async def get_human_video_track(room: rtc.Room):
+    track_future = asyncio.Future[rtc.RemoteVideoTrack]()
+
+    def on_sub(track: rtc.Track, *_):
+        if isinstance(track, rtc.RemoteVideoTrack):
+            track_future.set_result(track)
+
+    room.on("track_subscribed", on_sub)
+
+    remote_video_tracks: List[rtc.RemoteVideoTrack] = []
+    for _, p in room.participants.items():
+        for _, t_pub in p.tracks.items():
+            if t_pub.track is not None and isinstance(
+                t_pub.track, rtc.RemoteVideoTrack
+            ):
+                remote_video_tracks.append(t_pub.track)
+
+    if len(remote_video_tracks) > 0:
+        track_future.set_result(remote_video_tracks[0])
+
+    video_track = await track_future
+    room.off("track_subscribed", on_sub)
+    return video_track
+
+
+async def entrypoint(ctx: JobContext):
+    sip = ctx.room.name.startswith("sip")
+    initial_ctx = ChatContext(
+        messages=[
+            ChatMessage(
+                role=ChatRole.SYSTEM,
+                text=(
+                    "You are a funny bot created by LiveKit. Your interface with users will be voice. "
+                    "You should use short and concise responses, and avoiding usage of unpronouncable punctuation."
+                ),
+            )
+        ]
+    )
+
+    gpt = openai.LLM(
+        model="gpt-4o",
+    )
+
+    # Since OpenAI does not support streaming TTS, we'll use it with a StreamAdapter
+    # to make it compatible with the VoiceAssistant
+    openai_tts = tts.StreamAdapter(
+        tts=openai.TTS(voice="alloy"),
+        sentence_tokenizer=tokenize.basic.SentenceTokenizer(),
+    )
+
+    latest_image: rtc.VideoFrame | None = None
+    img_msg_queue: deque[agents.llm.ChatMessage] = deque()
+    assistant = VoiceAssistant(
+        vad=silero.VAD(),
+        stt=deepgram.STT(),
+        llm=gpt,
+        tts=openai_tts,
+        fnc_ctx=None if sip else AssistantFnc(),
+        chat_ctx=initial_ctx,
+    )
+
+    chat = rtc.ChatManager(ctx.room)
+
+    async def _answer_from_text(text: str):
+        chat_ctx = copy.deepcopy(assistant.chat_context)
+        chat_ctx.messages.append(ChatMessage(role=ChatRole.USER, text=text))
+
+        stream = await gpt.chat(chat_ctx)
+        await assistant.say(stream)
+
+    @chat.on("message_received")
+    def on_chat_received(msg: rtc.ChatMessage):
+        if not msg.message:
+            return
+
+        asyncio.create_task(_answer_from_text(msg.message))
+
+    async def respond_to_image(user_msg: str):
+        nonlocal latest_image, img_msg_queue, initial_ctx
+        if not latest_image:
+            await assistant.say(NO_IMAGE_MESSAGE_GENERIC)
+            return
+
+        initial_ctx.messages.append(
+            agents.llm.ChatMessage(
+                role=agents.llm.ChatRole.USER,
+                text=user_msg,
+                images=[agents.llm.ChatImage(image=latest_image)],
+            )
+        )
+        img_msg_queue.append(initial_ctx.messages[-1])
+        if len(img_msg_queue) >= MAX_IMAGES:
+            msg = img_msg_queue.popleft()
+            msg.images = []
+
+        stream = await gpt.chat(initial_ctx)
+        await assistant.say(stream, allow_interruptions=True)
+
+    @assistant.on("function_calls_finished")
+    def _function_calls_done(ctx: AssistantContext):
+        user_msg = ctx.get_metadata("user_msg")
+        if not user_msg:
+            return
+        asyncio.ensure_future(respond_to_image(user_msg))
+
+    assistant.start(ctx.room)
+
+    await asyncio.sleep(0.5)
+    await assistant.say("Hey, how can I help you today?", allow_interruptions=True)
+    while ctx.room.connection_state == rtc.ConnectionState.CONN_CONNECTED:
+        video_track = await get_human_video_track(ctx.room)
+        async for event in rtc.VideoStream(video_track):
+            latest_image = event.frame
+
+
+async def request_fnc(req: JobRequest) -> None:  # Handler passed to WorkerOptions below.
+    logging.info("received request %s", req)  # Log every incoming job request.
+    await req.accept(entrypoint)  # Accept the job request, passing `entrypoint` to it.
+
+
+if __name__ == "__main__":  # Only run the CLI when this file is executed as a script.
+    cli.run_app(WorkerOptions(request_fnc))  # Hand the CLI a WorkerOptions built around request_fnc.