# EasyReasy.KnowledgeBase.BertTokenization

A BERT tokenization extension for EasyReasy.KnowledgeBase with FastBertTokenizer integration.
BERT-based tokenizer implementation for EasyReasy.KnowledgeBase. Provides accurate token counting and text processing using the FastBertTokenizer library with the BERT base uncased vocabulary.

## Installation
dotnet add package EasyReasy.KnowledgeBase.BertTokenization
using EasyReasy.KnowledgeBase.BertTokenization;
// Create tokenizer
BertTokenizer tokenizer = await BertTokenizer.CreateAsync();
// Count tokens
int tokenCount = tokenizer.CountTokens("Hello, world!");
// Encode text to tokens
int[] tokens = tokenizer.Encode("This is a test sentence.");
// Decode tokens back to text
string decoded = tokenizer.Decode(tokens);
Console.WriteLine($"Token count: {tokenCount}");
Console.WriteLine($"Tokens: [{string.Join(", ", tokens)}]");
Console.WriteLine($"Decoded: {decoded}");
using EasyReasy.KnowledgeBase.BertTokenization;
using EasyReasy.KnowledgeBase.Chunking;
// Create tokenizer for use with document processing
BertTokenizer tokenizer = await BertTokenizer.CreateAsync();
// Use with section reader factory
SectionReaderFactory factory = new SectionReaderFactory(embeddingService, tokenizer);
using Stream stream = File.OpenRead("document.md");
SectionReader reader = factory.CreateForMarkdown(stream, maxTokensPerChunk: 100, maxTokensPerSection: 1000);
await foreach (List<KnowledgeFileChunk> chunks in reader.ReadSectionsAsync())
{
// Process each batch of chunks with accurate token counts
}
// Configure maximum encoding tokens to prevent truncation
BertTokenizer tokenizer = await BertTokenizer.CreateAsync();
tokenizer.MaxEncodingTokens = 4096; // Default is 2048
// Count tokens for longer texts
int tokenCount = tokenizer.CountTokens("Very long document text...");

## API Reference

### Creation
static Task<BertTokenizer> CreateAsync()
static Task<BertTokenizer> CreateAsync(FastBertTokenizer.BertTokenizer tokenizer)

### Properties
- `MaxEncodingTokens`: Maximum tokens allowed during encoding (default: 2048)

### Methods
- `CountTokens(string text)`: Count tokens in text
- `Encode(string text)`: Encode text to a token array (subject to the `MaxEncodingTokens` limit)
- `Decode(int[] tokens)`: Decode tokens back to text

`BertTokenizer` implements the `ITokenizer` interface. <!-- NOTE(review): the garbled original mentions "ITokenizer)" — confirm the exact interface wording against the package source. -->

## License

MIT