当前位置: 首页 > news >正文

BPE分词器实现

bpe_tokenizer.py


import collections
import json
from typing import Dict, List, Optional, Tuple


class BPETokenizer:
    """A minimal byte-pair-encoding (BPE) tokenizer.

    Training greedily learns an ordered list of pair-merge rules from a
    plain-text corpus; tokenization replays those rules over the
    character sequence of the input text.
    """

    # Special tokens are reserved FIRST in the final vocabulary.  The
    # original implementation appended them last and then truncated to
    # ``vocab_size``, so a full vocabulary silently dropped '<unk>' and
    # encode() raised KeyError on any out-of-vocabulary token.
    SPECIAL_TOKENS = ['<unk>', '<pad>', '<s>', '</s>']

    def __init__(self, vocab_size: int = 32000):
        """
        :param vocab_size: maximum size of the final vocabulary
        """
        self.vocab_size = vocab_size
        self.merges: List[Tuple[str, str]] = []    # ordered merge rules
        self.vocab: Optional[List[str]] = None     # final vocabulary
        self.token_to_id: Optional[Dict[str, int]] = None
        self.id_to_token: Optional[Dict[int, str]] = None

    def train(self, corpus: str, min_freq: int = 2) -> None:
        """Learn merge rules from *corpus*.

        :param corpus: whitespace-separated training text
        :param min_freq: stop merging once the most frequent pair occurs
            fewer than this many times
        """
        # Reset learned state so repeated train() calls do not
        # accumulate merge rules from a previous corpus.
        self.merges = []

        # 1. Split into words and flatten into one token stream with
        #    per-word boundary markers.
        # NOTE(review): because the stream is flat, a merge may span a
        # word boundary (e.g. ('</s>', '<s>')); classic BPE merges
        # within words only.  Kept as-is to preserve the original design.
        tokenized_corpus: List[str] = []
        for word in corpus.split():
            tokenized_corpus.extend(['<s>'] + list(word) + ['</s>'])

        # 2. Initial vocabulary: every distinct symbol in the stream.
        vocab = list(set(tokenized_corpus))

        # 3. Greedily merge the most frequent adjacent pair.  Budget the
        #    loop so the vocabulary, *including* the special tokens added
        #    afterwards, never exceeds vocab_size (the original budget
        #    ignored them, so truncation could drop learned tokens).
        num_new_special = sum(1 for t in self.SPECIAL_TOKENS if t not in vocab)
        for _ in range(self.vocab_size - len(vocab) - num_new_special):
            pair_freq = self._get_pair_freq(tokenized_corpus)
            if not pair_freq:
                break
            max_pair = max(pair_freq, key=pair_freq.get)
            # Stop once the best pair is too rare to be worth a rule.
            if pair_freq[max_pair] < min_freq:
                break
            self.merges.append(max_pair)
            # Apply the new rule to the working corpus.
            tokenized_corpus = self._merge_pair_in_corpus(tokenized_corpus, max_pair)
            new_token = max_pair[0] + max_pair[1]
            if new_token not in vocab:
                vocab.append(new_token)

        # 4. Final vocabulary (special tokens guaranteed) and id maps.
        self.vocab = self._create_vocab(vocab)
        self.token_to_id = {token: i for i, token in enumerate(self.vocab)}
        self.id_to_token = {i: token for i, token in enumerate(self.vocab)}

    def _get_pair_freq(self, tokens: List[str]) -> Dict[Tuple[str, str], int]:
        """Count the frequency of every adjacent token pair in *tokens*."""
        pair_freq: Dict[Tuple[str, str], int] = collections.defaultdict(int)
        for i in range(len(tokens) - 1):
            pair_freq[(tokens[i], tokens[i + 1])] += 1
        return pair_freq

    def _merge_pair_in_corpus(self, tokens: List[str],
                              pair: Tuple[str, str]) -> List[str]:
        """Return *tokens* with every occurrence of the adjacent *pair*
        fused into one token (left-to-right, non-overlapping)."""
        merged: List[str] = []
        i = 0
        while i < len(tokens):
            if i < len(tokens) - 1 and tokens[i] == pair[0] and tokens[i + 1] == pair[1]:
                merged.append(pair[0] + pair[1])
                i += 2  # skip both members of the merged pair
            else:
                merged.append(tokens[i])
                i += 1
        return merged

    def _create_vocab(self, vocab: List[str]) -> List[str]:
        """Build the final vocabulary with the special tokens first.

        Placing the special tokens at the front guarantees that the
        truncation to ``vocab_size`` can never remove them.
        """
        regular = [t for t in vocab if t not in self.SPECIAL_TOKENS]
        return (list(self.SPECIAL_TOKENS) + regular)[:self.vocab_size]

    def tokenize(self, text: str) -> List[str]:
        """Split *text* into BPE tokens using the learned merge rules."""
        # Wrap in boundary markers, replay merges in learned order, then
        # strip the bare markers.
        tokens = ['<s>'] + list(text) + ['</s>']
        for pair in self.merges:
            tokens = self._merge_pair_in_corpus(tokens, pair)
        # NOTE(review): only exact '<s>'/'</s>' tokens are removed; a
        # token that absorbed a marker via a merge (e.g. '<s>t') is kept.
        return [token for token in tokens if token not in ('<s>', '</s>')]

    def encode(self, text: str) -> List[int]:
        """Encode *text* as token ids; unknown tokens map to '<unk>'."""
        unk_id = self.token_to_id['<unk>']  # hoisted out of the loop
        return [self.token_to_id.get(token, unk_id) for token in self.tokenize(text)]

    def save(self, path: str) -> None:
        """Write ``<path>_vocab.json`` and ``<path>_merges.json``."""
        # Explicit UTF-8 keeps the files portable across platforms.
        with open(path + '_vocab.json', 'w', encoding='utf-8') as f:
            json.dump(self.vocab, f)
        with open(path + '_merges.json', 'w', encoding='utf-8') as f:
            json.dump(self.merges, f)

    @classmethod
    def load(cls, path: str) -> 'BPETokenizer':
        """Restore a tokenizer previously persisted with :meth:`save`."""
        with open(path + '_vocab.json', 'r', encoding='utf-8') as f:
            vocab = json.load(f)
        with open(path + '_merges.json', 'r', encoding='utf-8') as f:
            merges = json.load(f)
        # Size the tokenizer to the stored vocabulary instead of the
        # constructor default, which the original ignored on load.
        tokenizer = cls(vocab_size=len(vocab))
        tokenizer.vocab = vocab
        # JSON round-trips tuples as lists; restore tuples so merges
        # match the shape produced by train().
        tokenizer.merges = [tuple(m) for m in merges]
        tokenizer.token_to_id = {token: i for i, token in enumerate(vocab)}
        tokenizer.id_to_token = {i: token for i, token in enumerate(vocab)}
        return tokenizer

test.py


"""Demo script: train a BPE tokenizer on a short English corpus, then
tokenize and encode a sample sentence."""
from tokenizer.bpe_tokenizer import BPETokenizer

corpus = "The sun shone brightly over the city park, casting golden rays through the leaves of ancient oaks. People strolled along the winding paths, some with coffee cups in hand, others chatting with friends or enjoying a good book under the shade of a large tree. Children laughed as they chased each other around the playground, their shouts echoing against the backdrop of rustling leaves. A golden retriever bounded past, its tail wagging furiously, while a couple sat on a weathered bench, sharing a quiet conversation. The air carried the scent of freshly cut grass and blooming flowers, a refreshing contrast to the urban chaos just beyond the park gates. A street musician played a soft melody on his guitar, drawing a small crowd to listen. Nearby, a jogger passed by, breathing steadily, eyes fixed on the horizon. It was a perfect afternoon, where time seemed to slow down. For a moment, the worries of daily life faded away, replaced by the simple joy of being present. Even the pigeons seemed to move with a relaxed pace, pecking at crumbs scattered by kind-hearted visitors. This little oasis of calm reminded everyone that nature’s beauty is always available, if only we take a moment to notice it. As the clock ticked toward evening, the park began to quieten, but the feeling of peace lingered, a gentle reminder to pause and appreciate the small wonders of everyday life."

# Train a 500-entry vocabulary on the demo corpus.
bpe = BPETokenizer(vocab_size=500)
bpe.train(corpus)

# Tokenization example.
sample = "Hello world, my name is harvey."
pieces = bpe.tokenize(sample)
print(f"tokens:{pieces}")

# Encoding example.
sample_ids = bpe.encode(sample)
print(f"ids:{sample_ids}")

output

tokens:['H', 'e', 'l', 'lo', ' ', 'w', 'or', 'ld', ',', ' ', 'm', 'y', ' ', 'n', 'a', 'm', 'e', ' ', 'is', ' ', 'h', 'ar', 'v', 'e', 'y', '.']

ids:[196, 17, 6, 80, 196, 4, 129, 100, 10, 196, 13, 8, 196, 30, 22, 13, 17, 196, 191, 196, 9, 60, 11, 17, 8, 31]

http://www.jsqmd.com/news/433008/

相关文章:

  • 新鲜出炉!2026徐汇专家推荐服务优的宠物医院排行,狗狗耳道内窥镜检查/宠物绝育/狗狗隐睾绝育,宠物医院专家找哪个 - 品牌推荐师
  • 主机清单和ad-hoc
  • 2026年3月光纤激光切管机厂家推荐,资质案例售后机构深度解读 - 品牌鉴赏师
  • 折扣影票api接口对接的详细操作指南
  • Mask2Former-Swin城市景观数据集图像分割模型[特殊字符]
  • 11个免费开源后台管理系统模板
  • Mask2Former图像分割全攻略:从Swin架构到COCO实战应用 [特殊字符]
  • 刷榜冠军秒变“删库侠“?揭秘AI基座模型失控的惨烈真相!
  • Docker Desktop(详细使用流程)
  • 游戏人物移动效果对应实际刷新率对比与Client-side Prediction Interpolation调整优化
  • DeepSeek V4,下周正式登场!
  • Mask2Former图像分割技术解析[特殊字符]
  • 2026年3月手持激光焊机厂家推荐,产能专利环保三维数据全面透视 - 品牌鉴赏师
  • 【无人机编队】基于人工势场算法的多无人机复杂障碍物环境下的自主避障与路径规划附Matlab代码
  • Benchmark:大数组随机访问,和取模乘法
  • 【机器人】四足机器人+正运动设计+逆运动学解算+步态设计Matlab程序
  • 降AI工具年度盘点:2026上半年哪些工具值得续费? - 还在做实验的师兄
  • 2026年3月市场青睐的超高压反应釜厂商,速来了解,深海设备水压测试/等静压设备,超高压反应釜厂家口碑推荐 - 品牌推荐师
  • 【电力系统】PMSM电机定子绕组匝间短路故障、电机故障诊断+转子磁场损失Matlab代码
  • 知网AIGC检测不通过?别慌,这套方案帮我一次过关 - 还在做实验的师兄
  • GitHub上那些star过千的C++学习仓库与面试资源,我都整理好了!
  • 第二类斯特林数列
  • 供应链计划到底怎么做?三层计划、六个动作,一次讲清!
  • 免费降AI神器2026:新用户必看的省钱攻略 - 还在做实验的师兄
  • 信息类专业毕业设计中常见问题与难点总结
  • 蓝桥/16/B.4/水质检测
  • 多维衰老表型的蛋白质组图谱
  • 京东e卡回收,闲置秒变真金白银 - 京顺回收
  • Kriging代理模型+RSM响应面分析+NSGAII多目标优化+熵权法-TOPSIS决策MATLAB代码
  • 从0到1搭建企业数据中心:AI应用架构师的实战步骤