Speech-to-Text for .NET using Whisper, powered by AI/ML API. Transcribe audio files, generate subtitles (SRT/VTT), detect languages, and get word-level timestamps. Supports MP3, WAV, M4A, WebM, and more. Perfect for meeting transcription, podcast subtitles, voice notes, and accessibility features.
$ dotnet add package ForeverTools.STT
Speech-to-Text for .NET using Whisper, powered by AI/ML API. Transcribe audio files, generate subtitles, detect languages, and get word-level timestamps.
This package uses the AI/ML API, which provides access to Whisper and 400+ AI models.
dotnet add package ForeverTools.STT
using ForeverTools.STT;
// Create client with your API key
var client = new SpeechToTextClient("your-api-key");
// Transcribe an audio file
var text = await client.TranscribeAsync("meeting.mp3");
Console.WriteLine(text);
// Set AIML_API_KEY or STT_API_KEY environment variable
var client = SpeechToTextClient.FromEnvironment();
// Simple - just get the text
var text = await client.TranscribeAsync("podcast.mp3");
// Detailed - get timestamps and metadata
var result = await client.TranscribeWithDetailsAsync("podcast.mp3");
Console.WriteLine($"Text: {result.Text}");
Console.WriteLine($"Language: {result.Language}");
Console.WriteLine($"Duration: {result.Duration}");
foreach (var segment in result.Segments)
{
Console.WriteLine($"[{segment.Start:mm\\:ss} - {segment.End:mm\\:ss}] {segment.Text}");
}
// From bytes
byte[] audioData = File.ReadAllBytes("audio.mp3");
var text = await client.TranscribeAsync(audioData, "audio.mp3");
// From stream
using var stream = File.OpenRead("audio.mp3");
var text = await client.TranscribeAsync(stream, "audio.mp3");
// Transcribe audio from a URL
var text = await client.TranscribeFromUrlAsync("https://example.com/podcast.mp3");
// Get SRT format directly
var srt = await client.TranscribeToSrtAsync("video.mp3");
File.WriteAllText("video.srt", srt);
// Output:
// 1
// 00:00:00,000 --> 00:00:05,230
// Welcome to the podcast.
//
// 2
// 00:00:05,230 --> 00:00:10,500
// Today we'll be discussing...
// Get WebVTT format
var vtt = await client.TranscribeToVttAsync("video.mp3");
File.WriteAllText("video.vtt", vtt);
// Output:
// WEBVTT
//
// 00:00:00.000 --> 00:00:05.230
// Welcome to the podcast.
//
// 00:00:05.230 --> 00:00:10.500
// Today we'll be discussing...
var result = await client.TranscribeWithDetailsAsync("audio.mp3");
// Generate subtitles from segments
var srt = SpeechToTextClient.GenerateSrt(result.Segments);
var vtt = SpeechToTextClient.GenerateVtt(result.Segments);
// Language is auto-detected by default
var result = await client.TranscribeWithDetailsAsync("audio.mp3");
Console.WriteLine($"Detected language: {result.Language}");
var result = await client.TranscribeWithDetailsAsync(new TranscriptionRequest
{
FilePath = "audio.mp3",
Language = TranscriptionLanguages.Spanish
});
var detection = await client.DetectLanguageAsync("audio.mp3");
Console.WriteLine($"Language: {detection.LanguageName} ({detection.LanguageCode})");
var result = await client.TranscribeWithDetailsAsync(new TranscriptionRequest
{
FilePath = "meeting.mp3",
Model = SttModels.WhisperLargeV3, // Use larger model for accuracy
Language = TranscriptionLanguages.English,
Temperature = 0.2f, // Lower = more deterministic
Prompt = "Meeting about Q4 financials", // Guide vocabulary
ResponseFormat = ResponseFormats.VerboseJson
});
// Fast model (default)
var options = new SpeechToTextOptions
{
ApiKey = "your-api-key",
DefaultModel = SttModels.Whisper1
};
// High accuracy model
var options = new SpeechToTextOptions
{
ApiKey = "your-api-key",
DefaultModel = SttModels.WhisperLargeV3
};
var client = new SpeechToTextClient(options);
// Transcribe several files with a single batch call and print a short preview of each.
var files = new[] { "meeting1.mp3", "meeting2.mp3", "meeting3.mp3" };
var results = await client.TranscribeBatchAsync(files);
foreach (var result in results)
{
    // Show at most the first 100 characters. Calling Substring(0, 100) directly
    // throws ArgumentOutOfRangeException when the transcript is shorter than that,
    // so truncate (and add the ellipsis) only when the text is actually longer.
    var text = result.Text ?? string.Empty;
    var preview = text.Length > 100 ? text.Substring(0, 100) + "..." : text;
    Console.WriteLine($"Duration: {result.Duration}, Text: {preview}");
}
// In Program.cs
builder.Services.AddForeverToolsSTT("your-api-key");
// Or with configuration
builder.Services.AddForeverToolsSTT(options =>
{
options.ApiKey = "your-api-key";
options.DefaultModel = SttModels.WhisperLargeV3;
options.DefaultLanguage = "en";
});
// appsettings.json
{
"SpeechToText": {
"ApiKey": "your-api-key",
"DefaultModel": "whisper-1",
"DefaultLanguage": "en",
"Temperature": 0
}
}
builder.Services.AddForeverToolsSTT(builder.Configuration);
/// <summary>
/// Example service that receives a <see cref="SpeechToTextClient"/> through its
/// constructor and uses it to transcribe audio files.
/// </summary>
public class TranscriptionService
{
    private readonly SpeechToTextClient _speechToText;

    /// <summary>Stores the supplied client for later transcription calls.</summary>
    /// <param name="stt">The speech-to-text client to use.</param>
    public TranscriptionService(SpeechToTextClient stt) => _speechToText = stt;

    /// <summary>Transcribes the audio file at <paramref name="filePath"/>.</summary>
    /// <param name="filePath">Path of the audio file to transcribe.</param>
    /// <returns>The text returned by the client's <c>TranscribeAsync</c> call.</returns>
    public async Task<string> TranscribeMeetingAsync(string filePath) =>
        await _speechToText.TranscribeAsync(filePath);
}
| Format | Extension | MIME Type |
|---|---|---|
| MP3 | .mp3 | audio/mpeg |
| WAV | .wav | audio/wav |
| M4A | .m4a | audio/mp4 |
| WebM | .webm | audio/webm |
| FLAC | .flac | audio/flac |
| OGG | .ogg | audio/ogg |
| MP4 | .mp4 | audio/mp4 |
| Model | Best For | Speed | Accuracy |
|---|---|---|---|
| `whisper-1` | General use | Fast | Good |
| `whisper-large-v3` | High accuracy | Slower | Excellent |
| `whisper-large-v3-turbo` | Balanced | Medium | Very Good |
// Catch each failure type separately so every case prints its own message.
try
{
var text = await client.TranscribeAsync("audio.mp3");
}
catch (FileNotFoundException ex)
{
// FileName is the path carried by the exception.
Console.WriteLine($"File not found: {ex.FileName}");
}
catch (ArgumentException ex)
{
// NOTE(review): presumably thrown for invalid or unsupported input - confirm against the library.
Console.WriteLine($"Invalid input: {ex.Message}");
}
catch (HttpRequestException ex)
{
// NOTE(review): presumably thrown when the HTTP request to the API fails - confirm.
Console.WriteLine($"API error: {ex.Message}");
}
- Use `whisper-1` for speed, `whisper-large-v3` for accuracy.
- Create a single `SpeechToTextClient` and reuse it.

MIT License - see LICENSE file for details.