Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion eng/packages/General.props
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
<PackageVersion Include="Microsoft.Extensions.VectorData.Abstractions" Version="$(MicrosoftExtensionsVectorDataAbstractionsVersion)" />
<PackageVersion Include="Microsoft.ML.Tokenizers" Version="$(MicrosoftMLTokenizersVersion)" />
<PackageVersion Include="ModelContextProtocol.Core" Version="1.2.0" />
<PackageVersion Include="OpenAI" Version="2.9.1" />
<PackageVersion Include="OpenAI" Version="2.10.0" />
<PackageVersion Include="Polly" Version="8.4.2" />
<PackageVersion Include="Polly.Core" Version="8.4.2" />
<PackageVersion Include="Polly.Extensions" Version="8.4.2" />
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
using System.Diagnostics;
using System.Diagnostics.CodeAnalysis;
using System.Linq;
using System.Net.Mime;
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Text;
Expand Down Expand Up @@ -292,6 +291,14 @@ internal static IEnumerable<ChatMessage> ToChatMessages(IEnumerable<ResponseItem
message.Contents.Add(new ToolResultContent(computerCallOutput.CallId) { RawRepresentation = computerCallOutput });
break;

case ApplyPatchCallItem patchCall:
message.Contents.Add(new ToolCallContent(patchCall.CallId) { RawRepresentation = patchCall });
break;

case ApplyPatchCallOutputItem patchCallOutput:
message.Contents.Add(new ToolResultContent(patchCallOutput.CallId) { RawRepresentation = patchCallOutput });
break;

default:
message.Contents.Add(new() { RawRepresentation = outputItem });
break;
Expand Down Expand Up @@ -1088,12 +1095,16 @@ internal static IEnumerable<ResponseItem> ToOpenAIResponseItems(IEnumerable<Chat
(parts ??= []).Add(ResponseContentPart.CreateInputImagePart(uriContent.Uri, GetImageDetail(item)));
break;

case UriContent uriContent when uriContent.MediaType.StartsWith("application/pdf", StringComparison.OrdinalIgnoreCase):
(parts ??= []).Add(ResponseContentPart.CreateInputFilePart(uriContent.Uri));
break;

case DataContent dataContent when dataContent.HasTopLevelMediaType("image"):
(parts ??= []).Add(ResponseContentPart.CreateInputImagePart(new Uri(dataContent.Uri), GetImageDetail(item)));
(parts ??= []).Add(ResponseContentPart.CreateInputImagePart(BinaryData.FromBytes(dataContent.Data, dataContent.MediaType), GetImageDetail(item)));
break;

case DataContent dataContent when dataContent.MediaType.StartsWith("application/pdf", StringComparison.OrdinalIgnoreCase):
(parts ??= []).Add(ResponseContentPart.CreateInputFilePart(BinaryData.FromBytes(dataContent.Data), dataContent.MediaType, dataContent.Name ?? $"{Guid.NewGuid():N}.pdf"));
(parts ??= []).Add(ResponseContentPart.CreateInputFilePart(BinaryData.FromBytes(dataContent.Data, dataContent.MediaType), dataContent.MediaType, dataContent.Name ?? $"{Guid.NewGuid():N}.pdf"));
break;

case HostedFileContent fileContent when fileContent.HasTopLevelMediaType("image"):
Expand Down Expand Up @@ -1396,7 +1407,7 @@ private static List<AIContent> ToAIContents(IEnumerable<ResponseContentPart> con

foreach (ResponseContentPart part in contents)
{
AIContent? content;
AIContent content;
switch (part.Kind)
{
case ResponseContentPartKind.InputText or ResponseContentPartKind.OutputText:
Expand All @@ -1420,13 +1431,22 @@ private static List<AIContent> ToAIContents(IEnumerable<ResponseContentPart> con
}
else if (part.InputImageUri is { } inputImageUrl)
{
content = inputImageUrl.Scheme.Equals("data", StringComparison.OrdinalIgnoreCase) ?
new DataContent(inputImageUrl) :
new UriContent(inputImageUrl, MediaTypeMap.GetMediaType(inputImageUrl.AbsoluteUri) ?? "image/*");
if (inputImageUrl.StartsWith("data:", StringComparison.OrdinalIgnoreCase))
{
content = new DataContent(inputImageUrl);
}
else if (Uri.TryCreate(inputImageUrl, UriKind.Absolute, out Uri? imageUri))
{
content = new UriContent(imageUri, OpenAIClientExtensions.ImageUriToMediaType(imageUri));
}
else
{
goto default;
}
}
else
{
content = null;
goto default;
}

break;
Expand All @@ -1443,11 +1463,8 @@ private static List<AIContent> ToAIContents(IEnumerable<ResponseContentPart> con
break;
}

if (content is not null)
{
content.RawRepresentation = part;
results.Add(content);
}
content.RawRepresentation = part;
results.Add(content);
}

return results;
Expand Down Expand Up @@ -1496,36 +1513,25 @@ private static void PopulateAnnotations(ResponseContentPart source, AIContent de

/// <summary>
/// Extracts web search queries from a <see cref="WebSearchCallResponseItem"/>.
/// OpenAI exposes both a deprecated <c>action.query</c> (singular string) and <c>action.queries</c> (string array).
/// This helper reads whichever is present, preferring the array form.
/// </summary>
private static List<string>? GetWebSearchQueries(WebSearchCallResponseItem wscri)
{
List<string>? queries = null;

// Try the newer array field first.
if (wscri.Patch.TryGetJson("$.action.queries"u8, out ReadOnlyMemory<byte> queriesJson))
if (wscri.Action is WebSearchSearchAction searchAction)
{
Utf8JsonReader reader = new(queriesJson.Span);
if (reader.Read() && reader.TokenType is JsonTokenType.StartArray)
if (searchAction.Queries is { Count: > 0 } queries)
{
while (reader.Read() && reader.TokenType is JsonTokenType.String)
{
if (reader.GetString() is string q)
{
(queries ??= []).Add(q);
}
}
return [.. queries];
}
}

// Fall back to the deprecated singular field.
if (queries is null && wscri.Patch.TryGetValue("$.action.query"u8, out string? wsQuery) && wsQuery is not null)
{
queries = [wsQuery];
#pragma warning disable CS0618 // Query is deprecated in favor of Queries; used here as a fallback
if (searchAction.Query is not null)
{
return [searchAction.Query];
}
#pragma warning restore CS0618
}

return queries;
return null;
}

/// <summary>
Expand All @@ -1534,54 +1540,20 @@ private static void PopulateAnnotations(ResponseContentPart source, AIContent de
/// </summary>
private static List<AIContent>? GetWebSearchSources(WebSearchCallResponseItem wscri)
{
if (!wscri.Patch.TryGetJson("$.action.sources"u8, out ReadOnlyMemory<byte> sourcesJson))
if (wscri.Action is not WebSearchSearchAction { Sources.Count: > 0 } searchAction)
{
return null;
}

List<AIContent>? results = null;
var reader = new Utf8JsonReader(sourcesJson.Span);
if (!reader.Read() || reader.TokenType is not JsonTokenType.StartArray)
{
return null;
}

while (reader.Read() && reader.TokenType is JsonTokenType.StartObject)
foreach (var source in searchAction.Sources)
{
string? url = null;
string? title = null;

while (reader.Read() && reader.TokenType is not JsonTokenType.EndObject)
if (source is WebSearchActionUriSource { Uri: not null } uriSource)
{
if (reader.TokenType == JsonTokenType.PropertyName)
(results ??= []).Add(new UriContent(uriSource.Uri, "text/html")
{
if (reader.ValueTextEquals("url"u8))
{
_ = reader.Read();
url = reader.GetString();
}
else if (reader.ValueTextEquals("title"u8))
{
_ = reader.Read();
title = reader.GetString();
}
else
{
_ = reader.Read();
_ = reader.TrySkip();
}
}
}

if (url is not null && Uri.TryCreate(url, UriKind.Absolute, out Uri? uri))
{
UriContent uriContent = new(uri, "text/html");
if (title is not null)
{
uriContent.AdditionalProperties = new() { ["title"] = title };
}

(results ??= []).Add(uriContent);
RawRepresentation = uriSource,
});
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
using OpenAI;
using OpenAI.Audio;

#pragma warning disable MEAI001 // Type is for evaluation purposes only
#pragma warning disable OPENAI001 // Streaming transcription segment updates are experimental
#pragma warning disable S3011 // Reflection should not be used to increase accessibility of classes, methods, or fields
#pragma warning disable SA1204 // Static elements should appear before instance elements

Expand Down Expand Up @@ -102,6 +104,11 @@ public async Task<SpeechToTextResponse> GetTextAsync(
response.EndTime = transcription.Words[wordCount - 1].EndTime;
}
}

if (transcription.Usage is AudioTranscriptionTokenUsage tokenUsage)
{
response.Usage = ToUsageDetails(tokenUsage);
}
}

return response;
Expand Down Expand Up @@ -145,8 +152,19 @@ public async IAsyncEnumerable<SpeechToTextResponseUpdate> GetStreamingTextAsync(
result.Contents = [new TextContent(deltaUpdate.Delta)];
break;

case StreamingAudioTranscriptionTextSegmentUpdate segmentUpdate:
result.Kind = SpeechToTextResponseUpdateKind.TextUpdated;
result.StartTime = segmentUpdate.StartTime;
result.EndTime = segmentUpdate.EndTime;
break;

case StreamingAudioTranscriptionTextDoneUpdate doneUpdate:
result.Kind = SpeechToTextResponseUpdateKind.SessionClose;
if (doneUpdate.Usage is { } usage)
{
result.Contents = [new UsageContent(ToUsageDetails(usage))];
}

break;
}

Expand Down Expand Up @@ -184,4 +202,23 @@ private AudioTranslationOptions ToOpenAITranslationOptions(SpeechToTextOptions?

return result;
}

/// <summary>Maps <see cref="AudioTranscriptionTokenUsage"/> to <see cref="UsageDetails"/>.</summary>
private static UsageDetails ToUsageDetails(AudioTranscriptionTokenUsage tokenUsage)
{
    // Copy the top-level token counts directly from the OpenAI usage payload.
    UsageDetails usage = new()
    {
        InputTokenCount = tokenUsage.InputTokenCount,
        OutputTokenCount = tokenUsage.OutputTokenCount,
        TotalTokenCount = tokenUsage.TotalTokenCount,
    };

    // The audio/text input breakdown is optional; surface it only when present.
    if (tokenUsage.InputTokenDetails is { } breakdown)
    {
        usage.InputAudioTokenCount = breakdown.AudioTokenCount;
        usage.InputTextTokenCount = breakdown.TextTokenCount;
    }

    return usage;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.ClientModel;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.Runtime.CompilerServices;
Expand All @@ -12,6 +13,7 @@
using OpenAI;
using OpenAI.Audio;

#pragma warning disable OPENAI001 // Streaming speech generation is experimental
#pragma warning disable SA1204 // Static elements should appear before instance elements

namespace Microsoft.Extensions.AI;
Expand Down Expand Up @@ -78,11 +80,71 @@ public async Task<TextToSpeechResponse> GetAudioAsync(
public async IAsyncEnumerable<TextToSpeechResponseUpdate> GetStreamingAudioAsync(
string text, TextToSpeechOptions? options = null, [EnumeratorCancellation] CancellationToken cancellationToken = default)
{
// OpenAI's standard TTS API doesn't have a dedicated streaming endpoint in the SDK,
// so we fall back to the non-streaming approach and yield the result as a single update.
foreach (var update in (await GetAudioAsync(text, options, cancellationToken).ConfigureAwait(false)).ToTextToSpeechResponseUpdates())
_ = Throw.IfNull(text);

SpeechGenerationOptions openAIOptions = ToOpenAISpeechOptions(options);
string mediaType = GetMediaType(openAIOptions.ResponseFormat);

AsyncCollectionResult<StreamingSpeechUpdate>? streamingResult = null;
try
{
streamingResult = _audioClient.GenerateSpeechStreamingAsync(
text,
new GeneratedSpeechVoice(options?.VoiceId ?? DefaultVoice),
openAIOptions,
cancellationToken);
}
catch (NotSupportedException)
{
// Model doesn't support SSE streaming (e.g. tts-1, tts-1-hd).
}

if (streamingResult is null)
{
// Fall back to non-streaming for models that don't support SSE streaming.
foreach (var update in (await GetAudioAsync(text, options, cancellationToken).ConfigureAwait(false)).ToTextToSpeechResponseUpdates())
{
yield return update;
}

yield break;
}

await foreach (var update in streamingResult.ConfigureAwait(false))
{
yield return update;
switch (update)
{
case StreamingSpeechAudioDeltaUpdate deltaUpdate:
yield return new TextToSpeechResponseUpdate
{
Kind = TextToSpeechResponseUpdateKind.AudioUpdating,
Contents = [new DataContent(deltaUpdate.AudioBytes.ToMemory(), mediaType)],
ModelId = options?.ModelId ?? _metadata.DefaultModelId,
RawRepresentation = deltaUpdate,
};
break;

case StreamingSpeechAudioDoneUpdate doneUpdate:
var sessionClose = new TextToSpeechResponseUpdate
{
Kind = TextToSpeechResponseUpdateKind.SessionClose,
ModelId = options?.ModelId ?? _metadata.DefaultModelId,
RawRepresentation = doneUpdate,
};

if (doneUpdate.Usage is { } usage)
{
sessionClose.Contents = [new UsageContent(new()
{
InputTokenCount = usage.InputTokenCount,
OutputTokenCount = usage.OutputTokenCount,
TotalTokenCount = usage.TotalTokenCount,
})];
}

yield return sessionClose;
break;
}
}
}

Expand Down
Loading
Loading