Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Python: draft initial implementation of Realtime API #10127

Draft
wants to merge 20 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
206 changes: 206 additions & 0 deletions docs/decisions/00XX-realtime-api-clients.md

Large diffs are not rendered by default.

3 changes: 2 additions & 1 deletion python/.cspell.json
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@
"SEMANTICKERNEL",
"OTEL",
"vectorizable",
"desync"
"desync",
"webrtc"
]
}
11 changes: 8 additions & 3 deletions python/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ dependencies = [
"pybars4 ~= 0.9",
"jinja2 ~= 3.1",
"nest-asyncio ~= 1.6",
"taskgroup >= 0.2.2; python_version < '3.11'",
]

### Optional dependencies
Expand All @@ -61,7 +62,8 @@ chroma = [
]
google = [
"google-cloud-aiplatform ~= 1.60",
"google-generativeai ~= 0.7"
"google-generativeai ~= 0.7",
"google-genai ~= 0.4"
]
hugging_face = [
"transformers[torch] ~= 4.28",
Expand Down Expand Up @@ -123,6 +125,11 @@ dapr = [
"dapr-ext-fastapi>=1.14.0",
"flask-dapr>=1.14.0"
]
openai_realtime = [
"openai[realtime] ~= 1.0",
"aiortc>=1.9.0",
"sounddevice>=0.5.1",
]

[tool.uv]
prerelease = "if-necessary-or-explicit"
Expand Down Expand Up @@ -220,5 +227,3 @@ name = "semantic_kernel"
[build-system]
requires = ["flit-core >= 3.9,<4.0"]
build-backend = "flit_core.buildapi"


178 changes: 178 additions & 0 deletions python/samples/concepts/audio/04-chat_with_realtime_api.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio
import logging
from datetime import datetime
from random import randint

from semantic_kernel import Kernel
from semantic_kernel.connectors.ai import FunctionChoiceBehavior
from semantic_kernel.connectors.ai.open_ai import (
ListenEvents,
OpenAIRealtime,
OpenAIRealtimeExecutionSettings,
TurnDetection,
)
from semantic_kernel.connectors.ai.realtime_client_base import RealtimeClientBase
from semantic_kernel.connectors.ai.utils import SKAudioPlayer
from semantic_kernel.contents import ChatHistory, StreamingChatMessageContent
from semantic_kernel.functions import kernel_function

# Root logger only shows warnings and above.
logging.basicConfig(level=logging.WARNING)
# Explicitly pin the aiortc and aioice loggers to WARNING as well.
aiortc_log = logging.getLogger("aiortc")
aiortc_log.setLevel(logging.WARNING)
aioice_log = logging.getLogger("aioice")
aioice_log.setLevel(logging.WARNING)
# This sample's own logger is more verbose (INFO) so the function-call log lines below are visible.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# This simple sample demonstrates how to use the OpenAI Realtime API to create
# a chat bot that can listen and respond directly through audio.
# It requires installing:
# - semantic-kernel[openai_realtime]
# - pyaudio
# - sounddevice
# - pydub
# - aiortc
# e.g. pip install pyaudio sounddevice pydub

# The characteristics of your speaker and microphone are a big factor in a smooth conversation,
# so you may need to try out different devices for each.
# You can also play around with the turn_detection settings to get the best results.
# It has device IDs set in the AudioRecorderStream and AudioPlayerAsync classes,
# so you may need to adjust these for your system.
# You can check the available devices with the check_audio_devices function below; it logs them
# at DEBUG level, so set this module's logger level to DEBUG to see the output.


def check_audio_devices():
    """Log the audio devices reported by sounddevice at DEBUG level."""
    # Imported inside the function so sounddevice is only loaded when this helper runs.
    import sounddevice as sd

    available_devices = sd.query_devices()
    logger.debug(available_devices)


check_audio_devices()


class ReceivingStreamHandler:
"""This is a simple class that listens to the received buffer of the RealtimeClientBase.

It can be used to play audio and print the transcript of the conversation.

It can also be used to act on other events from the service.
"""

def __init__(self, realtime_client: RealtimeClientBase, audio_player: SKAudioPlayer | None = None):
self.audio_player = audio_player
self.realtime_client = realtime_client

async def listen(
self,
play_audio: bool = True,
print_transcript: bool = True,
) -> None:
# print the start message of the transcript
if print_transcript:
print("Mosscap (transcript): ", end="")
try:
# start listening for events
while True:
event_type, event = await self.realtime_client.receive_buffer.get()
match event_type:
case ListenEvents.RESPONSE_AUDIO_DELTA:
if play_audio and self.audio_player and isinstance(event, StreamingChatMessageContent):
await self.audio_player.add_audio(event.items[0])
case ListenEvents.RESPONSE_AUDIO_TRANSCRIPT_DELTA:
if print_transcript and isinstance(event, StreamingChatMessageContent):
print(event.content, end="")
case ListenEvents.RESPONSE_CREATED:
if print_transcript:
print("")
# case ....:
# # add other event handling here
await asyncio.sleep(0.01)
except asyncio.CancelledError:
print("\nThanks for talking to Mosscap!")


@kernel_function
def get_weather(location: str) -> str:
    """Get the weather for a location."""
    # The weather is simulated: one of a fixed set of conditions is picked at random per call.
    possible_conditions = ("sunny", "hot", "cloudy", "raining", "freezing", "snowing")
    last_index = len(possible_conditions) - 1
    weather = possible_conditions[randint(0, last_index)]  # nosec
    logger.info(f"Getting weather for {location}: {weather}")
    return f"The weather in {location} is {weather}."


@kernel_function
def get_date_time() -> str:
    """Get the current date and time."""
    logger.info("Getting current datetime")
    # datetime.now() without a tz argument yields a naive local timestamp, rendered as ISO 8601.
    current_timestamp = datetime.now().isoformat()
    return f"The current date and time is {current_timestamp}."


async def main() -> None:
    """Run the realtime voice chat sample.

    Creates a kernel with the two sample functions, an audio player and the OpenAI
    realtime client, updates the session with the settings and a seed chat history,
    and then runs `realtime_client.start_streaming()` and `stream_handler.listen()`
    concurrently in a task group.
    """
    # create the Kernel and add two simple functions for function calling.
    kernel = Kernel()
    kernel.add_function(plugin_name="weather", function_name="get_weather", function=get_weather)
    kernel.add_function(plugin_name="time", function_name="get_date_time", function=get_date_time)

    # create the realtime client; passing the audio output callback is optional
    audio_player = SKAudioPlayer()
    # you can define the protocol to use, either "websocket" or "webrtc"
    # they will behave the same way, even though the underlying protocol is quite different
    realtime_client = OpenAIRealtime(protocol="webrtc", audio_output_callback=audio_player.client_callback)

    # create stream receiver (defined above), this can play the audio,
    # if the audio_player is passed (it is not passed here)
    # and allows you to print the transcript of the conversation
    # and review or act on other events from the service
    stream_handler = ReceivingStreamHandler(realtime_client)  # no audio_player passed to the handler

    # Create the settings for the session
    # The realtime api does not use a system message, but takes instructions as a parameter for a session
    instructions = """
You are a chat bot. Your name is Mosscap and
you have one goal: figure out what people need.
Your full name, should you need to know it, is
Splendid Speckled Mosscap. You communicate
effectively, but you tend to answer with long
flowery prose.
"""
    # the key thing to decide on is whether to enable the server_vad turn detection
    # if turn detection is turned off (by setting turn_detection=None), you will have to send
    # the "input_audio_buffer.commit" and "response.create" events to the realtime api
    # to signal the end of the user's turn and start the response.
    # manual VAD is not part of this sample
    settings = OpenAIRealtimeExecutionSettings(
        instructions=instructions,
        voice="alloy",
        turn_detection=TurnDetection(type="server_vad", create_response=True, silence_duration_ms=800, threshold=0.8),
        function_choice_behavior=FunctionChoiceBehavior.Auto(),
    )
    # and we can add a chat history to the conversation after starting it
    chat_history = ChatHistory()
    chat_history.add_user_message("Hi there, who are you?")
    chat_history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")

    # the context manager calls the create_session method on the client and starts listening to the audio stream
    async with realtime_client, audio_player:
        await realtime_client.update_session(
            settings=settings, chat_history=chat_history, kernel=kernel, create_response=True
        )
        # you can also send other events to the service, like this (the first has content, the second does not)
        # (this requires importing SendEvents and ChatMessageContent, which this sample does not do)
        # await realtime_client.send(
        #     SendEvents.CONVERSATION_ITEM_CREATE,
        #     item=ChatMessageContent(role="user", content="Hi there, who are you?"),
        # )
        # await realtime_client.send(SendEvents.RESPONSE_CREATE)
        # NOTE: asyncio.TaskGroup is only available on Python 3.11 and later.
        async with asyncio.TaskGroup() as tg:
            tg.create_task(realtime_client.start_streaming())
            tg.create_task(stream_handler.listen())


if __name__ == "__main__":
    # Tell the user how to interact with the sample before the session starts.
    print(
        "Instruction: start speaking, when you stop the API should detect you finished and start responding. "
        "Press ctrl + c to stop the program."
    )
    asyncio.run(main())
Original file line number Diff line number Diff line change
Expand Up @@ -276,7 +276,9 @@ async def get_streaming_chat_message_contents(
for msg in messages:
if msg is not None:
all_messages.append(msg)
if any(isinstance(item, FunctionCallContent) for item in msg.items):
if not function_call_returned and any(
isinstance(item, FunctionCallContent) for item in msg.items
):
function_call_returned = True
yield messages

Expand Down Expand Up @@ -442,7 +444,10 @@ def _get_ai_model_id(self, settings: "PromptExecutionSettings") -> str:
return getattr(settings, "ai_model_id", self.ai_model_id) or self.ai_model_id

def _yield_function_result_messages(self, function_result_messages: list) -> bool:
"""Determine if the function result messages should be yielded."""
"""Determine if the function result messages should be yielded.

If there are messages and if the first message has items, then yield the messages.
"""
return len(function_result_messages) > 0 and len(function_result_messages[0].items) > 0

# endregion
50 changes: 50 additions & 0 deletions python/semantic_kernel/connectors/ai/function_calling_utils.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
# Copyright (c) Microsoft. All rights reserved.

from collections import OrderedDict
from collections.abc import Callable
from copy import deepcopy
from typing import TYPE_CHECKING, Any

from semantic_kernel.contents.chat_message_content import ChatMessageContent
from semantic_kernel.contents.function_result_content import FunctionResultContent
from semantic_kernel.contents.streaming_chat_message_content import StreamingChatMessageContent
from semantic_kernel.contents.utils.author_role import AuthorRole
from semantic_kernel.exceptions.service_exceptions import ServiceInitializationError
from semantic_kernel.utils.experimental_decorator import experimental_function

if TYPE_CHECKING:
from semantic_kernel.connectors.ai.function_choice_behavior import (
Expand All @@ -16,6 +19,7 @@
)
from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
from semantic_kernel.functions.kernel_function_metadata import KernelFunctionMetadata
from semantic_kernel.kernel import Kernel


def update_settings_from_function_call_configuration(
Expand Down Expand Up @@ -129,3 +133,49 @@ def merge_streaming_function_results(
function_invoke_attempt=function_invoke_attempt,
)
]


@experimental_function
def prepare_settings_for_function_calling(
    settings: "PromptExecutionSettings",
    settings_class: type["PromptExecutionSettings"],
    update_settings_callback: Callable[..., None],
    kernel: "Kernel",
) -> "PromptExecutionSettings":
    """Return a copy of the settings that is ready for function calling.

    The incoming settings are deep-copied, converted to `settings_class` when they
    are not already an instance of it, and, if a function choice behavior is set,
    that behavior is configured into the copy.

    Args:
        settings: Prompt execution settings.
        settings_class: The settings class the result should be an instance of.
        update_settings_callback: The callback handed to the function choice
            behavior to update the settings.
        kernel: Kernel instance.

    Returns:
        PromptExecutionSettings of type settings_class.
    """
    # Work on a copy so the settings object passed in by the caller is never mutated.
    prepared = deepcopy(settings)
    if not isinstance(prepared, settings_class):
        prepared = settings_class.from_prompt_execution_settings(prepared)

    # Backwards compatibility: a legacy `FunctionCallBehavior` found on the settings
    # is translated into the equivalent `FunctionChoiceBehavior`.
    from semantic_kernel.connectors.ai.function_call_behavior import FunctionCallBehavior
    from semantic_kernel.connectors.ai.function_choice_behavior import FunctionChoiceBehavior

    legacy_behavior = getattr(prepared, "function_call_behavior", None)
    if isinstance(legacy_behavior, FunctionCallBehavior):
        prepared.function_choice_behavior = FunctionChoiceBehavior.from_function_call_behavior(legacy_behavior)

    choice_behavior = prepared.function_choice_behavior
    if not choice_behavior:
        # Nothing to configure when no function choice behavior is set.
        return prepared

    # Configure the function choice behavior into the settings object
    # that will become part of the request to the AI service.
    choice_behavior.configure(
        kernel=kernel,
        update_settings_callback=update_settings_callback,
        settings=prepared,
    )
    return prepared
11 changes: 11 additions & 0 deletions python/semantic_kernel/connectors/ai/open_ai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,10 @@
OpenAIPromptExecutionSettings,
OpenAITextPromptExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_realtime_execution_settings import (
OpenAIRealtimeExecutionSettings,
TurnDetection,
)
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_text_to_audio_execution_settings import (
OpenAITextToAudioExecutionSettings,
)
Expand All @@ -36,10 +40,12 @@
from semantic_kernel.connectors.ai.open_ai.services.azure_text_to_image import AzureTextToImage
from semantic_kernel.connectors.ai.open_ai.services.open_ai_audio_to_text import OpenAIAudioToText
from semantic_kernel.connectors.ai.open_ai.services.open_ai_chat_completion import OpenAIChatCompletion
from semantic_kernel.connectors.ai.open_ai.services.open_ai_realtime import OpenAIRealtime
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_completion import OpenAITextCompletion
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_embedding import OpenAITextEmbedding
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_to_audio import OpenAITextToAudio
from semantic_kernel.connectors.ai.open_ai.services.open_ai_text_to_image import OpenAITextToImage
from semantic_kernel.connectors.ai.open_ai.services.realtime.const import ListenEvents, SendEvents
from semantic_kernel.connectors.ai.open_ai.settings.azure_open_ai_settings import AzureOpenAISettings
from semantic_kernel.connectors.ai.open_ai.settings.open_ai_settings import OpenAISettings

Expand All @@ -63,12 +69,15 @@
"DataSourceFieldsMapping",
"DataSourceFieldsMapping",
"ExtraBody",
"ListenEvents",
"OpenAIAudioToText",
"OpenAIAudioToTextExecutionSettings",
"OpenAIChatCompletion",
"OpenAIChatPromptExecutionSettings",
"OpenAIEmbeddingPromptExecutionSettings",
"OpenAIPromptExecutionSettings",
"OpenAIRealtime",
"OpenAIRealtimeExecutionSettings",
"OpenAISettings",
"OpenAITextCompletion",
"OpenAITextEmbedding",
Expand All @@ -77,4 +86,6 @@
"OpenAITextToAudioExecutionSettings",
"OpenAITextToImage",
"OpenAITextToImageExecutionSettings",
"SendEvents",
"TurnDetection",
]
Loading
Loading