当前位置: 首页 > news >正文

训练 Tokenizer - yi

常用数据集
import random
import json
import os
from transformers import AutoTokenizer, PreTrainedTokenizerFast
from tokenizers import (decoders,models,pre_tokenizers,trainers,Tokenizer,
)
from tokenizers.normalizers import NFKC
from typing import Generator

random.seed(42)


def read_texts_from_jsonl(file_path: str) -> Generator[str, None, None]:
    """Yield the 'text' field of every JSON line in *file_path*.

    Lines that are not valid JSON, or that lack a 'text' key, are reported
    to stdout and skipped so one bad line cannot abort a long training run.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f, 1):
            try:
                data = json.loads(line)
                if 'text' not in data:
                    raise KeyError(f"Missing 'text' field in line {line_num}")
                yield data['text']
            except json.JSONDecodeError:
                print(f"Error decoding JSON in line {line_num}")
                continue
            except KeyError as e:
                print(e)
                continue


def create_tokenizer_config(save_dir: str) -> None:
    """Write tokenizer_config.json and special_tokens_map.json into *save_dir*.

    These two files make the trained tokenizer loadable through
    AutoTokenizer.from_pretrained as a PreTrainedTokenizerFast.
    """
    config = {
        "add_bos_token": False,
        "add_eos_token": False,
        "add_prefix_space": True,
        "bos_token": "<|im_start|>",
        "eos_token": "<|im_end|>",
        "pad_token": "<|im_end|>",
        "unk_token": "<unk>",
        "model_max_length": 1000000000000000019884624838656,
        "clean_up_tokenization_spaces": False,
        "tokenizer_class": "PreTrainedTokenizerFast",
        # ChatML-style template: system / user / assistant turns wrapped in
        # <|im_start|> ... <|im_end|> markers.
        "chat_template": (
            "{% for message in messages %}"
            "{% if message['role'] == 'system' %}"
            "<|im_start|>system\n{{ message['content'] }}<|im_end|>\n"
            "{% elif message['role'] == 'user' %}"
            "<|im_start|>user\n{{ message['content'] }}<|im_end|>\n"
            "{% elif message['role'] == 'assistant' %}"
            "<|im_start|>assistant\n{{ message['content'] }}<|im_end|>\n"
            "{% endif %}"
            "{% endfor %}"
            "{% if add_generation_prompt %}"
            "{{ '<|im_start|>assistant\n' }}"
            "{% endif %}"
        )
    }

    # Main tokenizer configuration file.
    with open(os.path.join(save_dir, "tokenizer_config.json"), "w", encoding="utf-8") as f:
        json.dump(config, f, ensure_ascii=False, indent=4)

    # special_tokens_map.json keeps the special-token roles explicit.
    special_tokens_map = {
        "bos_token": "<|im_start|>",
        "eos_token": "<|im_end|>",
        "unk_token": "<unk>",
        "pad_token": "<|im_end|>",
        "additional_special_tokens": ["<s>", "</s>"]
    }
    with open(os.path.join(save_dir, "special_tokens_map.json"), "w", encoding="utf-8") as f:
        json.dump(special_tokens_map, f, ensure_ascii=False, indent=4)


def train_tokenizer(data_path: str, save_dir: str, vocab_size: int = 8192) -> None:
    """Train a byte-level BPE tokenizer on *data_path* and save it to *save_dir*.

    Raises ValueError if the special tokens do not land on ids 0..4, which the
    saved config implicitly relies on.
    """
    os.makedirs(save_dir, exist_ok=True)

    # Byte-level BPE with NFKC text normalization.
    tokenizer = Tokenizer(models.BPE(unk_token="<unk>"))
    tokenizer.normalizer = NFKC()
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    tokenizer.decoder = decoders.ByteLevel()

    # Listed first so the trainer assigns them ids 0..4 in this order.
    special_tokens = ["<unk>", "<s>", "</s>", "<|im_start|>", "<|im_end|>"]

    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=special_tokens,
        min_frequency=2,  # filter very low-frequency merges
        show_progress=True,
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet()
    )

    print(f"Training tokenizer with data from {data_path}")
    texts = read_texts_from_jsonl(data_path)
    # BUGFIX: the original passed length=os.path.getsize(data_path), i.e. the
    # file size in BYTES, where train_from_iterator expects the number of
    # items in the iterator. That only skewed the progress bar, so the
    # argument is simply omitted here.
    tokenizer.train_from_iterator(texts, trainer=trainer)

    # Validate the special-token id mapping explicitly (assert statements are
    # stripped under `python -O`, so they are unsuitable for this check).
    for expected_id, token in enumerate(special_tokens):
        actual_id = tokenizer.token_to_id(token)
        if actual_id != expected_id:
            raise ValueError(
                f"Special token {token!r} mapped to id {actual_id}, "
                f"expected {expected_id}"
            )

    # Persist the trained tokenizer plus its HF-compatible config files.
    tokenizer.save(os.path.join(save_dir, "tokenizer.json"))
    create_tokenizer_config(save_dir)
    print(f"Tokenizer saved to {save_dir}")


def eval_tokenizer(tokenizer_path: str) -> None:
    """Smoke-test a saved tokenizer: load it, render the chat template, and
    round-trip encode/decode, printing the results for manual inspection."""
    try:
        tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
    except Exception as e:
        print(f"Error loading tokenizer: {e}")
        return

    # Basic properties.
    print("\n=== Tokenizer基本信息 ===")
    print(f"Vocab size: {len(tokenizer)}")
    print(f"Special tokens: {tokenizer.all_special_tokens}")
    print(f"Special token IDs: {tokenizer.all_special_ids}")

    # Chat-template rendering.
    messages = [
        {"role": "system", "content": "你是一个AI助手。"},
        {"role": "user", "content": "How are you?"},
        {"role": "assistant", "content": "I'm fine, thank you. and you?"},
        {"role": "user", "content": "I'm good too."},
        {"role": "assistant", "content": "That's great to hear!"},
    ]
    print("\n=== 聊天模板测试 ===")
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        # add_generation_prompt=True
    )
    print("Generated prompt:\n", prompt, sep="")

    # Encode/decode round-trip.
    print("\n=== 编码解码测试 ===")
    encoded = tokenizer(prompt, truncation=True, max_length=256)
    decoded = tokenizer.decode(encoded["input_ids"], skip_special_tokens=False)
    print("Decoded text matches original:", decoded == prompt)

    # Special-token preservation through a round-trip.
    print("\n=== 特殊token处理 ===")
    test_text = "<|im_start|>user\nHello<|im_end|>"
    encoded = tokenizer(test_text).input_ids
    decoded = tokenizer.decode(encoded)
    print(f"Original: {test_text}")
    print(f"Decoded:  {decoded}")
    print("Special tokens preserved:", decoded == test_text)


def main():
    """Train the tokenizer, then run the evaluation smoke test on it."""
    data_path = "your data path"  # TODO: point at a JSONL corpus with a 'text' field
    save_dir = "tokenizer_k"

    train_tokenizer(data_path=data_path, save_dir=save_dir, vocab_size=6144)
    eval_tokenizer(save_dir)


if __name__ == '__main__':
    main()

 

常用数据集
http://www.jsqmd.com/news/541127/

相关文章:

  • Apache ShenYu API 网关项目教程
  • 如何使用Cobalt实现与Notion、Obsidian的无缝集成:完整指南
  • 基于YOLO Tracking的实时人体姿态跟踪实现教程
  • Go gRPC中间件v2升级指南:从v1到v2的完整迁移策略
  • HertzBeat高性能集群架构深度解析:如何支撑大规模监控场景的终极指南
  • SEO_详解SEO优化的常见误区及解决办法(474 )
  • Mermaid CLI终极指南:3分钟掌握命令行图表生成神器
  • 游戏模组革命:BepInEx插件框架如何彻底改变你的游戏体验?
  • MangoHud与HDR视频编码:质量与性能监控终极指南
  • 如何快速上手Apache OpenWhisk Python动作开发:完整指南与实战教程
  • Apache Kyuubi 核心技术术语解析
  • Markdown Viewer自定义主题:从样式定制到场景落地的全指南
  • HelloWorld.h:嵌入式LED硬件抽象库设计与实战
  • 对抗攻击新思路:为什么Diffusion模型比GAN更适合生成隐蔽攻击样本?
  • Nacos 1.4.0启动失败?可能是你的Tomcat嵌入式容器配置有问题
  • 超实用dc.js性能优化指南:让大数据可视化提速50%的终极技巧
  • 如何为Fantasque Sans字体项目贡献代码:完整开源字体开发指南
  • 3步精通pinyinjs:从基础转换到企业级应用
  • 人工智能入门学习DAY3
  • 英雄联盟智能工具League-Toolkit:效率提升与智能辅助完全指南
  • 白发转黑哪个品牌有效?黑奥秘头皮生态论,根源调理更专业 - 美业信息观察
  • TVM构建系统详解:CMake与Makefile配置最佳实践
  • TagStudio自定义主题开发终极指南:打造个性化视觉体验
  • 在 C# 中,原子操作主要通过 System.Threading 命名空间中的工具和 Interlocked 类实现,用于确保多线程环境下的线程安全操作
  • 白转黑哪个养发机构更专业?黑奥秘AI智能检测,千人千方更精准 - 美业信息观察
  • HertzBeat自定义监控模板开发终极指南:打造专属监控能力 [特殊字符]
  • 手把手教你用MATLAB读取南极洲流域边界SHP文件(附避坑指南)
  • Leaflet地图定位全攻略:从点位到多边形的4种实战方法(附代码)
  • Day 7
  • AI检测率太高论文过不了?这4个AI写作智能降重工具2026年必须用!