HTML和PDF文档图片对比工具,支持URL和byte[]输入,支持图片过滤功能
$ dotnet add package HtmlPDFContrastImage.Window
HTML和PDF文档图片对比工具 - Windows平台NuGet包
✅ 多种输入方式 - 支持文件路径、URL和byte[]输入
✅ 图片过滤 - 支持通过byte[]过滤指定图片
✅ 灵活配置 - 所有对比参数都可配置
✅ 高性能匹配 - 使用感知哈希和匈牙利算法
✅ 异步API - 完全异步,支持高并发
dotnet add package HtmlPDFContrastImage.Window
using HtmlPDFContrastImage.Window;
// 创建对比器
await using var comparer = new HtmlPdfComparer();
// 执行对比
var result = await comparer.CompareAsync(
htmlSource: @"C:\path\to\file.html",
pdfSource: @"C:\path\to\file.pdf"
);
// 输出结果
Console.WriteLine($"匹配成功: {result.MatchedCount}/{result.HtmlImageCount}");
Console.WriteLine($"HTML匹配率: {result.MatchRateByHtml:P2}");
Console.WriteLine($"PDF匹配率: {result.MatchRateByPdf:P2}");
await using var comparer = new HtmlPdfComparer();
var result = await comparer.CompareAsync(
htmlSource: "https://example.com/document.html",
pdfSource: "https://example.com/document.pdf"
);
byte[] htmlBytes = File.ReadAllBytes("file.html");
byte[] pdfBytes = File.ReadAllBytes("file.pdf");
await using var comparer = new HtmlPdfComparer();
var result = await comparer.CompareAsync(
htmlBytes: htmlBytes,
pdfBytes: pdfBytes
);
// HTML从URL获取,PDF从本地文件
await using var comparer = new HtmlPdfComparer();
var result = await comparer.CompareAsync(
htmlSource: "https://example.com/document.html",
pdfSource: @"C:\local\file.pdf"
);
using System.Collections.Immutable;
var options = new CompareOptions
{
// 排除特定名称的图片
ExcludeImageNames = ImmutableList.Create("logo.png", "header.jpg"),
// 排除特定路径的图片
ExcludeImagePaths = ImmutableList.Create(@"C:\temp\watermark.png"),
// 哈希相似度阈值 (0.0-1.0)
HashThreshold = 0.95,
// 图片相似度阈值 (0.0-1.0)
SimilarityThreshold = 0.85,
// 匹配算法: Greedy(贪心) 或 Hungarian(匈牙利)
MatchAlgorithm = MatchAlgorithm.Hungarian,
// 相似度计算方法: PerceptualHash(感知哈希) | Histogram(直方图) | SSIM(结构相似性)
SimilarityMethod = SimilarityMethod.PerceptualHash
};
await using var comparer = new HtmlPdfComparer(options);
// 准备要过滤的图片(不参与对比)
var filterImages = new List<byte[]>
{
File.ReadAllBytes("logo.png"),
File.ReadAllBytes("watermark.png")
};
await using var comparer = new HtmlPdfComparer();
var result = await comparer.CompareAsync(
htmlSource: "document.html",
pdfSource: "document.pdf",
filterImageBytes: filterImages // 这些图片会被排除在对比之外
);
// 配置自定义的HttpClient(用于URL下载)
var httpClient = new HttpClient
{
Timeout = TimeSpan.FromMinutes(5)
};
httpClient.DefaultRequestHeaders.Add("User-Agent", "MyApp/1.0");
await using var comparer = new HtmlPdfComparer(
options: null,
httpClient: httpClient
);
// 设置最大并发数为4
await using var comparer = new HtmlPdfComparer(
options: null,
httpClient: null,
maxConcurrency: 4
);
// 批量对比
var tasks = Enumerable.Range(1, 10).Select(async i =>
{
return await comparer.CompareAsync(
htmlSource: $"file{i}.html",
pdfSource: $"file{i}.pdf"
);
});
var results = await Task.WhenAll(tasks);
public class CompareResult
{
public bool Success { get; init; } // 是否成功
public int HtmlImageCount { get; init; } // HTML图片总数
public int PdfImageCount { get; init; } // PDF图片总数
public int MatchedCount { get; init; } // 匹配成功数
public int UnmatchedHtmlCount { get; init; } // HTML未匹配数
public int UnmatchedPdfCount { get; init; } // PDF未匹配数
public double MatchRateByHtml { get; init; } // HTML匹配率 (0.0-1.0)
public double MatchRateByPdf { get; init; } // PDF匹配率 (0.0-1.0)
public TimeSpan ElapsedTime { get; init; } // 处理耗时
public List<string> Errors { get; init; } // 错误信息
public List<ImagePairInfo> MatchedPairs { get; init; } // 匹配的图片对
}
var result = await comparer.CompareAsync(...);
if (result.Success)
{
Console.WriteLine($"✅ 对比成功");
Console.WriteLine($"📊 HTML: {result.HtmlImageCount} 张图片");
Console.WriteLine($"📊 PDF: {result.PdfImageCount} 张图片");
Console.WriteLine($"✓ 成功匹配: {result.MatchedCount} 对");
Console.WriteLine($"✗ HTML未匹配: {result.UnmatchedHtmlCount} 张");
Console.WriteLine($"✗ PDF未匹配: {result.UnmatchedPdfCount} 张");
Console.WriteLine($"📈 HTML匹配率: {result.MatchRateByHtml:P2}");
Console.WriteLine($"📈 PDF匹配率: {result.MatchRateByPdf:P2}");
Console.WriteLine($"⏱ 耗时: {result.ElapsedTime.TotalSeconds:F2}秒");
// 查看详细匹配信息
foreach (var pair in result.MatchedPairs)
{
Console.WriteLine($" HTML[{pair.HtmlIndex}] ↔ PDF[{pair.PdfIndex}] 相似度: {pair.Similarity:P2}");
}
}
else
{
Console.WriteLine($"❌ 对比失败");
foreach (var error in result.Errors)
{
Console.WriteLine($" - {error}");
}
}
using HtmlPDFContrastImage.Window;
using System.Collections.Immutable;
// 配置选项
var options = new CompareOptions
{
ExcludeImageNames = ImmutableList.Create("logo.png"),
HashThreshold = 0.95,
SimilarityThreshold = 0.85,
MatchAlgorithm = MatchAlgorithm.Hungarian,
SimilarityMethod = SimilarityMethod.PerceptualHash
};
// 准备过滤图片
var filterImages = new List<byte[]>
{
await File.ReadAllBytesAsync("watermark.png")
};
// 创建对比器
await using var comparer = new HtmlPdfComparer(options, maxConcurrency: 2);
// 执行对比(支持混合输入方式)
var result = await comparer.CompareAsync(
htmlSource: "https://example.com/doc.html", // 从URL
pdfBytes: await File.ReadAllBytesAsync("local.pdf"), // 从byte[]
filterImageBytes: filterImages
);
// 输出结果
if (result.Success)
{
Console.WriteLine($"✅ 成功匹配 {result.MatchedCount}/{result.HtmlImageCount} 张图片");
Console.WriteLine($"📈 匹配率: {result.MatchRateByHtml:P2}");
}
使用 await using 或 using 确保正确释放资源。
设置 maxConcurrency 以避免内存溢出。
如有问题或建议,请在GitHub提交Issue。
MIT License
版本: 1.0.0
作者: paper
更新日期: 2025-12-18