pkg

ZentrixLabs/ZentrixLabs.OcrCorrection v1.0.1

.NET 8.0

A comprehensive .NET library for correcting OCR errors in English text with ~837 battle-tested patterns. Specifically designed for Tesseract PGS subtitle extraction, achieving a 100% success rate on the tested corpus. Handles capital I/lowercase l confusion, spacing errors, apostrophe issues, and number/letter confusion. Zero false positives, modular architecture, multi-pass processing.

License

MIT

Deps

1

Install Size

—

Vulns

✓ 0

Published

Oct 13, 2025

Get Started

$ dotnet add package ZentrixLabs.OcrCorrection

Readme

ZentrixLabs.OcrCorrection

A comprehensive .NET library for correcting common OCR errors in English text, specifically designed for subtitle extraction and document digitization workflows.

Features

~837 Comprehensive Patterns - Extensively tested correction patterns for English OCR
Multi-Pass Processing - Iterative correction until convergence or max passes reached
High Performance - Pre-compiled regex patterns process feature-length films in ~1 second
Zero False Positives - Carefully curated patterns avoid breaking valid text
Modular Architecture - Organized pattern categories (Character Substitution, Spacing, Apostrophes, Numbers)
Detailed Logging - Track corrections by category with performance metrics
Dependency Injection Ready - Designed for modern .NET applications

Common OCR Errors Fixed

Capital I ↔ Lowercase l Confusion

The most common OCR error in subtitles. Our library handles ~660 specific patterns:

HeIIo → Hello
I'm gIad → I'm glad
TeII me → Tell me
stiII → still

Spacing Errors (~281 patterns)

Extra spaces: th e → the, wh at → what
Missing spaces: thejob → the job, ofthose → of those
-tion/-ation words: confus i on → confusion, inform at i on → information

dotnet add package ZentrixLabs.OcrCorrection

Install-Package ZentrixLabs.OcrCorrection

using ZentrixLabs.OcrCorrection.Core;
using ZentrixLabs.OcrCorrection.Patterns;

// Build the correction engine on top of the built-in English pattern provider
var engine = new OcrCorrectionEngine(new EnglishPatternProvider());

// Sample input containing OCR mistakes
var text = "HeIIo! I dont think th is looks right.";

// Correct the OCR errors in the sample
var correction = engine.CorrectText(text);

// Output: "Hello! I don't think this looks right."
Console.WriteLine(correction.CorrectedText);

// Output: "Corrections made: 4"
Console.WriteLine($"Corrections made: {correction.CorrectionsMade}");

// Ask for the detailed log, performance metrics, and per-correction details
var detailedOptions = new CorrectionOptions
{
    IncludeDetailedLog = true,
    IncludePerformanceMetrics = true,
    IncludeCorrectionDetails = true
};

var detailedResult = engine.CorrectText(text, detailedOptions);

// Print each recorded correction: pattern, original text, corrected text
foreach (var entry in detailedResult.CorrectionDetails)
{
    Console.WriteLine($"{entry.Pattern}: '{entry.Original}' → '{entry.Corrected}'");
}

using ZentrixLabs.OcrCorrection.Passes;

// Wrap the existing engine in a multi-pass processor
var processor = new MultiPassProcessor(engine);

// Options handed to the multi-pass run
var passOptions = new CorrectionOptions { IncludeDetailedLog = true };

// Process the text with an upper bound of five passes
var multiPassResult = await processor.ProcessAsync(text, maxPasses: 5, options: passOptions);

Console.WriteLine($"Converged after {multiPassResult.PassesCompleted} passes");
Console.WriteLine($"Total corrections: {multiPassResult.TotalCorrections}");

using Microsoft.Extensions.DependencyInjection;
using ZentrixLabs.OcrCorrection.Extensions;

// Register the OCR correction services in a new service collection
var serviceCollection = new ServiceCollection();
serviceCollection.AddOcrCorrection();

// Build the provider and resolve the engine through its interface
var provider = serviceCollection.BuildServiceProvider();
var engine = provider.GetRequiredService<IOcrCorrectionEngine>();

// Input and output subtitle file paths
const string sourcePath = "movie.srt";
const string correctedPath = "movie_corrected.srt";

// Load the entire SRT file as text
var subtitles = File.ReadAllText(sourcePath);

// Run the OCR correction over the subtitle text
var corrected = engine.CorrectText(subtitles);

// Write the corrected text to a separate SRT file
File.WriteAllText(correctedPath, corrected.CorrectedText);

Console.WriteLine($"✅ Corrected {corrected.CorrectionsMade} errors");

Film	Subtitles	Corrections	Result
28 Weeks Later (2007)	1,237	2	✅ Perfect
28 Years Later (2025)	1,231	4	✅ Perfect
28 Days Later (2002)	1,232	42	✅ Perfect
Alien (1979)	984	109	✅ Perfect
Alien: Covenant (2017)	1,515	158	✅ Perfect
A View to a Kill (1985)	965	190	✅ Perfect

CorrectionOptions options = new()
{
    // Emit the detailed correction log
    IncludeDetailedLog = true,

    // Collect performance metrics (processing time, etc.)
    IncludePerformanceMetrics = true,

    // Record details for each correction that was made
    IncludeCorrectionDetails = true,

    // Pattern categories to leave out, identified by name
    ExcludedCategories = new string[] { "Numbers" },

    // Experimental context-aware capitalization; disabled by default
    UseContextAwareCapitalization = false
};

EnglishPatternProvider provider = new();

// Fetch only the patterns that belong to the "Spacing" category
var spacingOnly = provider.GetPatternsByCategory("Spacing");

// List every category the provider exposes
// Returns: ["Apostrophes", "Capitalization", "Character Substitution", "Numbers", "Spacing"]
var allCategories = provider.GetCategories();

/// <summary>
/// Example pattern provider that supplies a single custom correction pattern.
/// </summary>
public class MyCustomPatternProvider : IPatternProvider
{
    /// <summary>Name of this provider.</summary>
    public string Name
    {
        get { return "Custom Patterns"; }
    }

    /// <summary>Language code reported by this provider.</summary>
    public string LanguageCode
    {
        get { return "en"; }
    }

    /// <summary>Returns the custom correction patterns.</summary>
    /// <returns>An array holding one pattern.</returns>
    public IEnumerable<CorrectionPattern> GetPatterns()
    {
        // Arguments look like (pattern, replacement, category) — confirm against CorrectionPattern
        var customErrorFix = new CorrectionPattern(@"\bcustomerror\b", "custom error", "Custom")
        {
            Description = "Fix custom error",
            Priority = 50
        };

        return new[] { customErrorFix };
    }

    // ... implement other interface members
}

// Create the custom provider and hand it to the engine
var customProvider = new MyCustomPatternProvider();
var engine = new OcrCorrectionEngine(customProvider);

// ❌ DANGEROUS - breaks valid words
(\w)(are)(\s) → "$1 $2$3"  // Breaks: "fanfare" → "fanf are"
(\w)(he)(\s) → "$1 $2$3"   // Breaks: "she" → "s he"