当前位置: 首页 > news > 正文

面试-Tokenizer训练

1 代码

# Note: It is not recommended to re-train the tokenizer ("vocabulary"); MiniMind
# already includes one. This script is for learning and reference only. Models
# trained with different tokenizers produce completely incompatible outputs,
# which reduces model reusability in the community.
import os
import json

from tokenizers import decoders, models, pre_tokenizers, trainers, Tokenizer

DATA_PATH = '../dataset/pretrain_hq.jsonl'
TOKENIZER_DIR = '../model_learn_tokenizer/'
VOCAB_SIZE = 6400


def get_texts(data_path):
    """Yield the 'text' field of each JSON line in *data_path*.

    Reads at most the first 10000 lines so experiments stay fast.
    """
    with open(data_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if i >= 10000:
                break  # experimental: only use the first 10000 lines for testing
            data = json.loads(line)
            yield data['text']


def train_tokenizer(data_path, tokenizer_dir, vocab_size):
    """Train a byte-level BPE tokenizer and save it to *tokenizer_dir*.

    Saves tokenizer.json, the raw BPE model files, and a HuggingFace-style
    tokenizer_config.json (including the chat template).
    """
    tokenizer = Tokenizer(models.BPE())
    # Byte-level pre-tokenization so every input is representable (no <unk> needed).
    tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
    trainer = trainers.BpeTrainer(
        vocab_size=vocab_size,
        special_tokens=["<|endoftext|>", "<|im_start|>", "<|im_end|>"],
        show_progress=True,
        # Seed the vocab with all 256 byte symbols so coverage is complete.
        initial_alphabet=pre_tokenizers.ByteLevel.alphabet(),
    )
    texts = get_texts(data_path)
    tokenizer.train_from_iterator(texts, trainer=trainer)
    tokenizer.decoder = decoders.ByteLevel()
    # The special tokens must occupy ids 0/1/2 — the config below relies on it.
    assert tokenizer.token_to_id("<|endoftext|>") == 0
    assert tokenizer.token_to_id("<|im_start|>") == 1
    assert tokenizer.token_to_id("<|im_end|>") == 2
    os.makedirs(tokenizer_dir, exist_ok=True)
    tokenizer.save(os.path.join(tokenizer_dir, "tokenizer.json"))
    tokenizer.model.save(tokenizer_dir)
    # HuggingFace PreTrainedTokenizerFast configuration, including the
    # Qwen-style chat template (tool calling + optional thinking block).
    config = {
        "add_bos_token": False,
        "add_eos_token": False,
        "add_prefix_space": False,
        "added_tokens_decoder": {
            "0": {"content": "<|endoftext|>", "lstrip": False, "normalized": False,
                  "rstrip": False, "single_word": False, "special": True},
            "1": {"content": "<|im_start|>", "lstrip": False, "normalized": False,
                  "rstrip": False, "single_word": False, "special": True},
            "2": {"content": "<|im_end|>", "lstrip": False, "normalized": False,
                  "rstrip": False, "single_word": False, "special": True},
        },
        "additional_special_tokens": [],
        "bos_token": "<|im_start|>",
        "clean_up_tokenization_spaces": False,
        "eos_token": "<|im_end|>",
        "legacy": True,
        "model_max_length": 32768,
        "pad_token": "<|endoftext|>",
        "sp_model_kwargs": {},
        "spaces_between_special_tokens": False,
        "tokenizer_class": "PreTrainedTokenizerFast",
        "unk_token": "<|endoftext|>",
        "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0].role == 'system' %}\n {{- messages[0].content + '\\n\\n' }}\n {%- endif %}\n {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' -%}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else -%}\n {{- '<|im_start|>system\\nYou are a helpful assistant<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n{%- for message in messages[::-1] %}\n {%- set index = (messages|length - 1) - loop.index0 %}\n {%- if ns.multi_step_tool and message.role == \"user\" and message.content is string and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n {%- set ns.multi_step_tool = false %}\n {%- set ns.last_query_index = index %}\n {%- endif %}\n{%- endfor %}\n{%- for message in messages %}\n {%- if message.content is string %}\n {%- set content = message.content %}\n {%- else %}\n {%- set content = '' %}\n {%- endif %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n {{- '<|im_start|>' + message.role + '\\n' + content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role + '\\n' + content }}\n {%- if message.tool_calls %}\n {%- for tool_call in message.tool_calls %}\n {%- if (loop.first and content) or (not loop.first) %}\n {{- '\\n' }}\n {%- endif %}\n {%- if tool_call.function %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {%- if tool_call.arguments is string %}\n {{- tool_call.arguments }}\n {%- else %}\n {{- tool_call.arguments | tojson }}\n {%- endif %}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {%- endif %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n {%- if enable_thinking is defined and enable_thinking is false %}\n {{- '<think>\\n\\n</think>\\n\\n' }}\n {%- endif %}\n{%- endif %}",
    }
    with open(os.path.join(tokenizer_dir, "tokenizer_config.json"), "w", encoding="utf-8") as f:
        json.dump(config, f, ensure_ascii=False, indent=4)
    print("Tokenizer training completed.")


def eval_tokenizer(tokenizer_dir):
    """Sanity-check the saved tokenizer: chat template, encode/decode round-trip,
    and a streaming-decode (byte-buffered) demo."""
    from transformers import AutoTokenizer  # local import: only needed for evaluation

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
    messages = [
        {"role": "system", "content": "你是一个优秀的聊天机器人,总是给我正确的回应!"},
        {"role": "user", "content": '你来自哪里?'},
        {"role": "assistant", "content": '我来自地球'},
    ]
    new_prompt = tokenizer.apply_chat_template(messages, tokenize=False)
    print('-' * 100)
    print(new_prompt)
    print('-' * 100)
    print('tokenizer词表长度:', len(tokenizer))
    model_inputs = tokenizer(new_prompt)
    print('encoder长度:', len(model_inputs['input_ids']))
    # Round-trip check: decoding the ids (keeping special tokens) must
    # reproduce the original prompt exactly.
    response = tokenizer.decode(model_inputs['input_ids'], skip_special_tokens=False)
    print('decoder一致性:', response == new_prompt, "\n")
    print('-' * 100)
    print('流式解码(字节缓冲)测试:')
    input_ids = model_inputs['input_ids']
    token_cache = []
    for tid in input_ids:
        token_cache.append(tid)
        current_decode = tokenizer.decode(token_cache)
        # Byte-level BPE can split a multi-byte character across tokens; flush
        # the buffer only once it decodes without a U+FFFD replacement char.
        if current_decode and '\ufffd' not in current_decode:
            display_ids = token_cache[0] if len(token_cache) == 1 else token_cache
            raw_tokens = [
                tokenizer.convert_ids_to_tokens(int(t))
                for t in (token_cache if isinstance(token_cache, list) else [token_cache])
            ]
            print(f'Token ID:{str(display_ids):15}-> Raw:{str(raw_tokens):20}-> Decode Str:{current_decode}')
            token_cache = []


if __name__ == '__main__':
    train_tokenizer(DATA_PATH, TOKENIZER_DIR, VOCAB_SIZE)
    eval_tokenizer(TOKENIZER_DIR)
http://www.jsqmd.com/news/339839/

相关文章:

  • 一文理清好人事管理的底层思维是什么
  • 绿联科技冲刺港股:9个月营收64亿利润4.7亿 绿联管理与和顺四号共套现近4亿
  • 大数据平台中Eureka的多数据中心部署方案
  • iOS 27 曝光!折叠屏、AI医生、Siri整容……看完我只想说:苹果这次拼了!
  • 基于数万次真机评测,RoboChallenge 首份年度报告发布
  • 基于Springboot健身房管理系统【附源码+文档】
  • 大坝、隧道深部位移监测 节段式位移计 系统组网核心要求是什么?
  • 复杂超深基坑环境监测难,不受天气人工影响且精确度高,自动化监测优势何在?
  • 基于SpringBoot的多媒体信息共享平台毕业设计
  • 展厅迎宾接待机器人技术深度解析与主流产品选型指南 - 智造出海
  • 2026年维保服务公司品牌综合评测与选型指南 - 2026年企业推荐榜
  • 实用指南:Python文件反编译,轻松找回自己的源码
  • 谷歌太壕了!编程Agent大招至简:开源且免费,百万上下文、多模态、MCP全支持
  • 一省之精,诚意可鉴:「省酒·省省酱」初品体验报告
  • GRR-RIPPER木工推料器,美国发明专利正在发起亚马逊站内侵权投诉!(US10011037B2)
  • 15年前,小沈阳一个晚上爆红年赚上亿,如今却“销声匿迹”?
  • 普推知产:商标申请注册怎样风险低一些?
  • 普推知产:申请注册商标注意不规范汉字字形!
  • 提示工程架构师进阶:打造企业级代码生成工具实战
  • 迅雷PC版 25.0.2.1068 | 精简绿化版,磁力下载神器,高速下载
  • 技术速递|使用 GitHub Copilot SDK 将智能体集成到任何应用中
  • 自建一个Agent很难吗?一语道破,万语难明
  • 马斯克旗下太空探索公司SpaceX合并xAI:前者估值1.5万亿美元
  • 如何构建一个真正可靠的AI Agent?
  • 难绷!和阿里 P11/P12 约会相亲?女网友竟称“也没那么难钓嘛”
  • 中国儒意完成25.74亿港元债券发行 刚1420万美元投爱诗科技
  • MongoTemplate简单操作mongdb
  • 1.5亿,AI原生城市服务平台建设项目
  • Waymo融资160亿美元:估值1260亿美元 红杉与DST领投
  • 【第1章·第14节】自适应PID控制器的simulink建模与仿真1——理论分析