Skip to content

Commit 60fdd7e

Browse files
committed
Enable downsampling of microphone input for speech recognition
- Introduce a `TargetSampleRate` property on the `SpeechListener` classes for speech recognition.
- When `TargetSampleRate` > 0 and the microphone's sample rate is higher than it, downsample the microphone input to the specified rate.
- Setting `TargetSampleRate` to 0 (default) bypasses downsampling and uses the raw input.
- Downsampling reduces the audio payload size, enabling smooth speech recognition over narrow-bandwidth networks.
1 parent e604b84 commit 60fdd7e

File tree

4 files changed

+46
-13
lines changed

4 files changed

+46
-13
lines changed

Scripts/SpeechListener/AzureSpeechListener.cs

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,27 +15,27 @@ public class AzureSpeechListener : SpeechListenerBase
1515
public string Region = string.Empty;
1616
public bool UseClassic = false;
1717

18-
protected override async UniTask<string> ProcessTranscriptionAsync(float[] samples, CancellationToken token)
18+
protected override async UniTask<string> ProcessTranscriptionAsync(float[] samples, int sampleRate, CancellationToken token)
1919
{
2020
if (UseClassic)
2121
{
22-
return await ProcessTranscriptionClassicAsync(samples, token);
22+
return await ProcessTranscriptionClassicAsync(samples, sampleRate, token);
2323
}
2424
else
2525
{
26-
return await ProcessTranscriptionFastAsync(samples, token);
26+
return await ProcessTranscriptionFastAsync(samples, sampleRate, token);
2727
}
2828
}
2929

30-
protected async UniTask<string> ProcessTranscriptionClassicAsync(float[] samples, CancellationToken token)
30+
protected async UniTask<string> ProcessTranscriptionClassicAsync(float[] samples, int sampleRate, CancellationToken token)
3131
{
3232
if (string.IsNullOrEmpty(ApiKey) || string.IsNullOrEmpty(Region) || string.IsNullOrEmpty(Language))
3333
{
3434
Debug.LogError("API Key, Region and Language are missing for AzureSpeechListener");
3535
}
3636

3737
var url = $"https://{Region}.stt.speech.microsoft.com/speech/recognition/conversation/cognitiveservices/v1?language={Language}";
38-
var requestData = SampleToPCM(samples, microphoneManager.SampleRate, 1);
38+
var requestData = SampleToPCM(samples, sampleRate, 1);
3939

4040
using (UnityWebRequest request = new UnityWebRequest(url, "POST"))
4141
{
@@ -58,7 +58,7 @@ protected async UniTask<string> ProcessTranscriptionClassicAsync(float[] samples
5858
}
5959
}
6060

61-
protected async UniTask<string> ProcessTranscriptionFastAsync(float[] samples, CancellationToken token)
61+
protected async UniTask<string> ProcessTranscriptionFastAsync(float[] samples, int sampleRate, CancellationToken token)
6262
{
6363
if (string.IsNullOrEmpty(ApiKey) || string.IsNullOrEmpty(Region) || string.IsNullOrEmpty(Language))
6464
{
@@ -78,7 +78,7 @@ protected async UniTask<string> ProcessTranscriptionFastAsync(float[] samples, C
7878
{"locales", locales},
7979
{"channels", new List<int>(){0, 1}}
8080
}));
81-
form.AddBinaryData("audio", SampleToPCM(samples, microphoneManager.SampleRate, 1), "voice.wav");
81+
form.AddBinaryData("audio", SampleToPCM(samples, sampleRate, 1), "voice.wav");
8282

8383
using (UnityWebRequest request = UnityWebRequest.Post(url, form))
8484
{

Scripts/SpeechListener/GoogleSpeechListener.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ public class GoogleSpeechListener : SpeechListenerBase
1616
public bool UseEnhancedModel = false;
1717
public List<SpeechContext> SpeechContexts;
1818

19-
protected override async UniTask<string> ProcessTranscriptionAsync(float[] samples, CancellationToken token)
19+
protected override async UniTask<string> ProcessTranscriptionAsync(float[] samples, int sampleRate, CancellationToken token)
2020
{
2121
if (string.IsNullOrEmpty(ApiKey) || string.IsNullOrEmpty(Language))
2222
{
@@ -25,7 +25,7 @@ protected override async UniTask<string> ProcessTranscriptionAsync(float[] sampl
2525

2626
var url = $"https://speech.googleapis.com/v1/speech:recognize?key={ApiKey}";
2727
var requestData = new SpeechRecognitionRequest(
28-
microphoneManager.SampleRate, 1, Language, UseEnhancedModel, SpeechContexts, samples
28+
sampleRate, 1, Language, UseEnhancedModel, SpeechContexts, samples
2929
);
3030
if (AlternativeLanguages?.Count > 0)
3131
{

Scripts/SpeechListener/OpenAISpeechListener.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ public class OpenAISpeechListener : SpeechListenerBase
1515
public float Temperature = 0.0f;
1616

1717
// See API document: https://platform.openai.com/docs/api-reference/audio/createTranscription
18-
protected override async UniTask<string> ProcessTranscriptionAsync(float[] samples, CancellationToken token)
18+
protected override async UniTask<string> ProcessTranscriptionAsync(float[] samples, int sampleRate, CancellationToken token)
1919
{
2020
if (string.IsNullOrEmpty(ApiKey) || string.IsNullOrEmpty(Model))
2121
{
@@ -29,7 +29,7 @@ protected override async UniTask<string> ProcessTranscriptionAsync(float[] sampl
2929
form.AddField("language", Language.Contains("-") ? Language.Split("-")[0] : Language);
3030
}
3131
form.AddField("response_format", "text");
32-
form.AddBinaryData("file", SampleToPCM(samples, microphoneManager.SampleRate, 1), "voice.wav"); // filename is required to transcribe
32+
form.AddBinaryData("file", SampleToPCM(samples, sampleRate, 1), "voice.wav"); // filename is required to transcribe
3333
if (!string.IsNullOrEmpty(Prompt))
3434
{
3535
form.AddField("prompt", Prompt);

Scripts/SpeechListener/SpeechListenerBase.cs

Lines changed: 35 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ public class SpeechListenerBase : MonoBehaviour, ISpeechListener
1818
public List<string> AlternativeLanguages;
1919
public bool AutoStart = true;
2020
public bool PrintResult = false;
21+
public int TargetSampleRate = 0;
2122

2223
public Func<string, UniTask> OnRecognized { get; set; }
2324

@@ -78,11 +79,43 @@ public void ChangeSessionConfig(float silenceDurationThreshold = float.MinValue,
7879
StartListening(true);
7980
}
8081

82+
/// <summary>
/// Converts audio samples from one sample rate to another using linear interpolation
/// between the two nearest source samples.
/// </summary>
/// <param name="samples">
/// Source samples, treated as one continuous (single-channel) sequence.
/// Returned unchanged when null, empty, or when both rates are equal.
/// </param>
/// <param name="originalRate">Sample rate of <paramref name="samples"/> in Hz; must be positive.</param>
/// <param name="targetRate">Desired sample rate in Hz; must be positive.</param>
/// <returns>
/// A new array holding ceil(samples.Length * targetRate / originalRate) samples,
/// or <paramref name="samples"/> itself when there is nothing to convert.
/// </returns>
/// <exception cref="System.ArgumentOutOfRangeException">A rate is zero or negative.</exception>
/// <remarks>
/// No low-pass filter is applied before decimation, so content above half the
/// target rate is not removed and may alias into the result.
/// </remarks>
private float[] Resample(float[] samples, int originalRate, int targetRate)
{
    // Nothing to convert: no audio at all, or the rates already match.
    if (samples == null || samples.Length == 0 || originalRate == targetRate)
    {
        return samples;
    }

    if (originalRate <= 0)
    {
        throw new System.ArgumentOutOfRangeException(nameof(originalRate), originalRate, "Sample rate must be positive.");
    }

    if (targetRate <= 0)
    {
        throw new System.ArgumentOutOfRangeException(nameof(targetRate), targetRate, "Sample rate must be positive.");
    }

    int lastIndex = samples.Length - 1;

    // Integer ceiling division: avoids the off-by-one a rounded float product can cause.
    long scaledLength = (long)samples.Length * targetRate;
    int dstLength = checked((int)((scaledLength + originalRate - 1) / originalRate));
    var dst = new float[dstLength];

    // Source samples advanced per output sample. Kept in double so the fractional
    // position stays accurate for long recordings (float runs out of mantissa bits
    // once the source index reaches the millions).
    double step = (double)originalRate / targetRate;

    for (int i = 0; i < dstLength; i++)
    {
        double srcIndex = i * step;

        // srcIndex is non-negative, so truncation equals floor. Both indices are
        // clamped so the final output sample never reads past the end.
        int i0 = System.Math.Min((int)srcIndex, lastIndex);
        int i1 = System.Math.Min(i0 + 1, lastIndex);
        float t = (float)(srcIndex - i0);

        dst[i] = samples[i0] + (samples[i1] - samples[i0]) * t;
    }

    return dst;
}
100+
81101
protected async UniTask HandleRecordingCompleteAsync(float[] samples, CancellationToken token)
82102
{
83103
try
84104
{
85-
var text = await ProcessTranscriptionAsync(samples, token);
105+
float[] samplesToTranscript;
106+
int sampleRate;
107+
if (TargetSampleRate > 0 && microphoneManager.SampleRate > TargetSampleRate)
108+
{
109+
sampleRate = TargetSampleRate;
110+
samplesToTranscript = Resample(samples, microphoneManager.SampleRate, TargetSampleRate);
111+
}
112+
else
113+
{
114+
sampleRate = microphoneManager.SampleRate;
115+
samplesToTranscript = samples;
116+
}
117+
118+
var text = await ProcessTranscriptionAsync(samplesToTranscript, sampleRate, token);
86119
if (PrintResult)
87120
{
88121
Debug.Log($"Speech recognized: {text} ({Name})");
@@ -106,7 +139,7 @@ protected async UniTask HandleRecordingCompleteAsync(float[] samples, Cancellati
106139
}
107140

108141
#pragma warning disable CS1998
109-
protected virtual async UniTask<string> ProcessTranscriptionAsync(float[] samples, CancellationToken token)
142+
protected virtual async UniTask<string> ProcessTranscriptionAsync(float[] samples, int sampleRate, CancellationToken token)
110143
{
111144
throw new NotImplementedException($"ProcessTranscriptionAsync for {Name} is not implemented");
112145
}

0 commit comments

Comments
 (0)