当前位置: 首页 > news >正文

BPE分词器实现

bpe_tokenizer.py


import collections
import json
from typing import Dict, List, Optional, Tuple


class BPETokenizer:
    """A minimal byte-pair-encoding (BPE) tokenizer.

    Training greedily learns an ordered list of pair-merge rules from a
    plain-text corpus; tokenization replays those rules over the
    character sequence of the input text.
    """

    # Special tokens are reserved FIRST in the final vocabulary.  The
    # original implementation appended them last and then truncated to
    # ``vocab_size``, so a full vocabulary silently dropped '<unk>' and
    # encode() raised KeyError on any out-of-vocabulary token.
    SPECIAL_TOKENS = ['<unk>', '<pad>', '<s>', '</s>']

    def __init__(self, vocab_size: int = 32000):
        """
        :param vocab_size: maximum size of the final vocabulary
        """
        self.vocab_size = vocab_size
        self.merges: List[Tuple[str, str]] = []    # ordered merge rules
        self.vocab: Optional[List[str]] = None     # final vocabulary
        self.token_to_id: Optional[Dict[str, int]] = None
        self.id_to_token: Optional[Dict[int, str]] = None

    def train(self, corpus: str, min_freq: int = 2) -> None:
        """Learn merge rules from *corpus*.

        :param corpus: whitespace-separated training text
        :param min_freq: stop merging once the most frequent pair occurs
            fewer than this many times
        """
        # Reset learned state so repeated train() calls do not
        # accumulate merge rules from a previous corpus.
        self.merges = []

        # 1. Split into words and flatten into one token stream with
        #    per-word boundary markers.
        # NOTE(review): because the stream is flat, a merge may span a
        # word boundary (e.g. ('</s>', '<s>')); classic BPE merges
        # within words only.  Kept as-is to preserve the original design.
        tokenized_corpus: List[str] = []
        for word in corpus.split():
            tokenized_corpus.extend(['<s>'] + list(word) + ['</s>'])

        # 2. Initial vocabulary: every distinct symbol in the stream.
        vocab = list(set(tokenized_corpus))

        # 3. Greedily merge the most frequent adjacent pair.  Budget the
        #    loop so the vocabulary, *including* the special tokens added
        #    afterwards, never exceeds vocab_size (the original budget
        #    ignored them, so truncation could drop learned tokens).
        num_new_special = sum(1 for t in self.SPECIAL_TOKENS if t not in vocab)
        for _ in range(self.vocab_size - len(vocab) - num_new_special):
            pair_freq = self._get_pair_freq(tokenized_corpus)
            if not pair_freq:
                break
            max_pair = max(pair_freq, key=pair_freq.get)
            # Stop once the best pair is too rare to be worth a rule.
            if pair_freq[max_pair] < min_freq:
                break
            self.merges.append(max_pair)
            # Apply the new rule to the working corpus.
            tokenized_corpus = self._merge_pair_in_corpus(tokenized_corpus, max_pair)
            new_token = max_pair[0] + max_pair[1]
            if new_token not in vocab:
                vocab.append(new_token)

        # 4. Final vocabulary (special tokens guaranteed) and id maps.
        self.vocab = self._create_vocab(vocab)
        self.token_to_id = {token: i for i, token in enumerate(self.vocab)}
        self.id_to_token = {i: token for i, token in enumerate(self.vocab)}

    def _get_pair_freq(self, tokens: List[str]) -> Dict[Tuple[str, str], int]:
        """Count the frequency of every adjacent token pair in *tokens*."""
        pair_freq: Dict[Tuple[str, str], int] = collections.defaultdict(int)
        for i in range(len(tokens) - 1):
            pair_freq[(tokens[i], tokens[i + 1])] += 1
        return pair_freq

    def _merge_pair_in_corpus(self, tokens: List[str],
                              pair: Tuple[str, str]) -> List[str]:
        """Return *tokens* with every occurrence of the adjacent *pair*
        fused into one token (left-to-right, non-overlapping)."""
        merged: List[str] = []
        i = 0
        while i < len(tokens):
            if i < len(tokens) - 1 and tokens[i] == pair[0] and tokens[i + 1] == pair[1]:
                merged.append(pair[0] + pair[1])
                i += 2  # skip both members of the merged pair
            else:
                merged.append(tokens[i])
                i += 1
        return merged

    def _create_vocab(self, vocab: List[str]) -> List[str]:
        """Build the final vocabulary with the special tokens first.

        Placing the special tokens at the front guarantees that the
        truncation to ``vocab_size`` can never remove them.
        """
        regular = [t for t in vocab if t not in self.SPECIAL_TOKENS]
        return (list(self.SPECIAL_TOKENS) + regular)[:self.vocab_size]

    def tokenize(self, text: str) -> List[str]:
        """Split *text* into BPE tokens using the learned merge rules."""
        # Wrap in boundary markers, replay merges in learned order, then
        # strip the bare markers.
        tokens = ['<s>'] + list(text) + ['</s>']
        for pair in self.merges:
            tokens = self._merge_pair_in_corpus(tokens, pair)
        # NOTE(review): only exact '<s>'/'</s>' tokens are removed; a
        # token that absorbed a marker via a merge (e.g. '<s>t') is kept.
        return [token for token in tokens if token not in ('<s>', '</s>')]

    def encode(self, text: str) -> List[int]:
        """Encode *text* as token ids; unknown tokens map to '<unk>'."""
        unk_id = self.token_to_id['<unk>']  # hoisted out of the loop
        return [self.token_to_id.get(token, unk_id) for token in self.tokenize(text)]

    def save(self, path: str) -> None:
        """Write ``<path>_vocab.json`` and ``<path>_merges.json``."""
        # Explicit UTF-8 keeps the files portable across platforms.
        with open(path + '_vocab.json', 'w', encoding='utf-8') as f:
            json.dump(self.vocab, f)
        with open(path + '_merges.json', 'w', encoding='utf-8') as f:
            json.dump(self.merges, f)

    @classmethod
    def load(cls, path: str) -> 'BPETokenizer':
        """Restore a tokenizer previously persisted with :meth:`save`."""
        with open(path + '_vocab.json', 'r', encoding='utf-8') as f:
            vocab = json.load(f)
        with open(path + '_merges.json', 'r', encoding='utf-8') as f:
            merges = json.load(f)
        # Size the tokenizer to the stored vocabulary instead of the
        # constructor default, which the original ignored on load.
        tokenizer = cls(vocab_size=len(vocab))
        tokenizer.vocab = vocab
        # JSON round-trips tuples as lists; restore tuples so merges
        # match the shape produced by train().
        tokenizer.merges = [tuple(m) for m in merges]
        tokenizer.token_to_id = {token: i for i, token in enumerate(vocab)}
        tokenizer.id_to_token = {i: token for i, token in enumerate(vocab)}
        return tokenizer

test.py


"""Demo script: train a BPE tokenizer on a short English corpus, then
tokenize and encode a sample sentence."""
from tokenizer.bpe_tokenizer import BPETokenizer

corpus = "The sun shone brightly over the city park, casting golden rays through the leaves of ancient oaks. People strolled along the winding paths, some with coffee cups in hand, others chatting with friends or enjoying a good book under the shade of a large tree. Children laughed as they chased each other around the playground, their shouts echoing against the backdrop of rustling leaves. A golden retriever bounded past, its tail wagging furiously, while a couple sat on a weathered bench, sharing a quiet conversation. The air carried the scent of freshly cut grass and blooming flowers, a refreshing contrast to the urban chaos just beyond the park gates. A street musician played a soft melody on his guitar, drawing a small crowd to listen. Nearby, a jogger passed by, breathing steadily, eyes fixed on the horizon. It was a perfect afternoon, where time seemed to slow down. For a moment, the worries of daily life faded away, replaced by the simple joy of being present. Even the pigeons seemed to move with a relaxed pace, pecking at crumbs scattered by kind-hearted visitors. This little oasis of calm reminded everyone that nature’s beauty is always available, if only we take a moment to notice it. As the clock ticked toward evening, the park began to quieten, but the feeling of peace lingered, a gentle reminder to pause and appreciate the small wonders of everyday life."

# Train a 500-entry vocabulary on the demo corpus.
bpe = BPETokenizer(vocab_size=500)
bpe.train(corpus)

# Tokenization example.
sample = "Hello world, my name is harvey."
pieces = bpe.tokenize(sample)
print(f"tokens:{pieces}")

# Encoding example.
sample_ids = bpe.encode(sample)
print(f"ids:{sample_ids}")

output

tokens:['H', 'e', 'l', 'lo', ' ', 'w', 'or', 'ld', ',', ' ', 'm', 'y', ' ', 'n', 'a', 'm', 'e', ' ', 'is', ' ', 'h', 'ar', 'v', 'e', 'y', '.']

ids:[196, 17, 6, 80, 196, 4, 129, 100, 10, 196, 13, 8, 196, 30, 22, 13, 17, 196, 191, 196, 9, 60, 11, 17, 8, 31]

http://www.jsqmd.com/news/433008/

相关文章:

  • 新鲜出炉!2026徐汇专家推荐服务优的宠物医院排行,狗狗耳道内窥镜检查/宠物绝育/狗狗隐睾绝育,宠物医院专家找哪个 - 品牌推荐师
  • 主机清单和ad-hoc
  • 2026年3月光纤激光切管机厂家推荐,资质案例售后机构深度解读 - 品牌鉴赏师
  • 折扣影票api接口对接的详细操作指南
  • Mask2Former-Swin城市景观数据集图像分割模型[特殊字符]
  • 11个免费开源后台管理系统模板
  • Mask2Former图像分割全攻略:从Swin架构到COCO实战应用 [特殊字符]
  • 刷榜冠军秒变“删库侠“?揭秘AI基座模型失控的惨烈真相!
  • Docker Desktop(详细使用流程)
  • 游戏人物移动效果对应实际刷新率对比与Client-side Prediction Interpolation调整优化
  • DeepSeek V4,下周正式登场!
  • Mask2Former图像分割技术解析[特殊字符]
  • 2026年3月手持激光焊机厂家推荐,产能专利环保三维数据全面透视 - 品牌鉴赏师
  • 【无人机编队】基于人工势场算法的多无人机复杂障碍物环境下的自主避障与路径规划附Matlab代码
  • Benchmark:大数组随机访问,和取模乘法
  • 【机器人】四足机器人+正运动设计+逆运动学解算+步态设计Matlab程序
  • 降AI工具年度盘点:2026上半年哪些工具值得续费? - 还在做实验的师兄
  • 2026年3月市场青睐的超高压反应釜厂商,速来了解,深海设备水压测试/等静压设备,超高压反应釜厂家口碑推荐 - 品牌推荐师
  • 【电力系统】PMSM电机定子绕组匝间短路故障、电机故障诊断+转子磁场损失Matlab代码
  • 知网AIGC检测不通过?别慌,这套方案帮我一次过关 - 还在做实验的师兄
  • GitHub上那些star过千的C++学习仓库与面试资源,我都整理好了!
  • 第二类斯特林数列
  • 供应链计划到底怎么做?三层计划、六个动作,一次讲清!
  • 免费降AI神器2026:新用户必看的省钱攻略 - 还在做实验的师兄
  • 信息类专业毕业设计中常见问题与难点总结
  • 蓝桥/16/B.4/水质检测
  • 多维衰老表型的蛋白质组图谱
  • 京东e卡回收,闲置秒变真金白银 - 京顺回收
  • Kriging代理模型+RSM响应面分析+NSGAII多目标优化+熵权法-TOPSIS决策MATLAB代码
  • 从0到1搭建企业数据中心:AI应用架构师的实战步骤