当前位置：首页 > news >正文

ChromaDB05-数据集导入

news 2026/3/27 1:18:55

ChromaDB05-数据集导入

本文演示如何将公开数据集导入ChromaDB。本地测试使用【Qwen3-Embedding-0.6B】作为embedding模型进行导入测试；Qwen3-Embedding-8B对电脑的性能要求太高，而且速度非常慢。再次感谢KUMI的开源贡献者。

1-参考网址

高考选择题数据集
KUMI可视化使用教程

2-动手实操

1-核心操作

1）下载数据集
2）编写Python脚本指定embedding模型
3）运行chromaDB向量数据库
4）运行脚本进行数据导入
5）测试导入的数据

2-下载数据集

git clone https://www.modelscope.cn/datasets/damotest/gaokao-benchmark.git

3-执行脚本导入数据

#!/usr/bin/env python3
"""Import the Gaokao question-bank dataset into ChromaDB.

Every subject file (JSONL, one question per line) found next to this script
is turned into one document per question and stored in a ChromaDB collection
that embeds documents with a SentenceTransformer model.
"""

import json
import os
import sys
from pathlib import Path
from typing import Any, Dict, List, Optional, Union

try:
    import chromadb
    from chromadb import ClientAPI
    from chromadb.config import Settings  # noqa: F401  (kept from the original script)
    from chromadb.utils.embedding_functions import (
        SentenceTransformerEmbeddingFunction,
    )
except ImportError:
    # Defer the failure: the pure helpers below stay importable, and the
    # install hint is printed as soon as a ChromaDB client is requested.
    chromadb = None

# Embedding model shared by the import and the query path; both sides must
# use the same model or the query vectors will not match the stored ones.
EMBEDDING_MODEL_NAME = "Qwen/Qwen3-Embedding-0.6B"

# Address of the ChromaDB server started separately by the user.
CHROMA_HOST = "localhost"
CHROMA_PORT = 8000

# Number of documents sent per ``collection.add`` call.
ADD_BATCH_SIZE = 256

# Subject name -> JSONL file name inside the dataset directory.
SUBJECT_FILES = {
    "biology": "gaokao-biology.jsonl",
    "chemistry": "gaokao-chemistry.jsonl",
    "chinese": "gaokao-chinese.jsonl",
    "english": "gaokao-english.jsonl",
    "geography": "gaokao-geography.jsonl",
    "mathcloze": "gaokao-mathcloze.jsonl",
    "mathqa": "gaokao-mathqa.jsonl",
    "physics": "gaokao-physics.jsonl",
}

# Value types stored in metadata unchanged; anything else is JSON-encoded.
_SCALAR_TYPES = (str, int, float, bool)


def _require_chromadb() -> None:
    """Print an install hint and exit with status 1 if chromadb is missing."""
    if chromadb is None:
        print("请先安装chromadb: pip install chromadb")
        sys.exit(1)


def _default_data_dir() -> str:
    """Return the directory that contains this script."""
    return os.path.dirname(os.path.abspath(__file__))


def _build_embedding_function():
    """Create the embedding function used for both import and query."""
    return SentenceTransformerEmbeddingFunction(model_name=EMBEDDING_MODEL_NAME)


def _metadata_value(value: Any) -> Any:
    """Return ``value`` if it is a scalar, otherwise its JSON text.

    NOTE(review): assumes the server only accepts scalar metadata values and
    that some subjects may carry a list-valued ``label`` -- confirm against
    the dataset and the installed ChromaDB version.
    """
    if isinstance(value, _SCALAR_TYPES):
        return value
    return json.dumps(value, ensure_ascii=False)


def _build_metadata(subject: str, idx: int, question: Dict[str, Any]) -> Dict[str, Any]:
    """Build the metadata dict stored alongside one question."""
    metadata = {"subject": subject, "index": idx}
    if question.get('label'):
        metadata['label'] = _metadata_value(question['label'])
    other = question.get('other')
    if other and isinstance(other, dict):
        if other.get('source'):
            metadata['source'] = _metadata_value(other['source'])
        if other.get('id'):
            metadata['question_id'] = _metadata_value(other['id'])
    return metadata


def load_jsonl(file_path: Union[str, Path]) -> List[Any]:
    """Load a JSONL file and return one parsed object per non-blank line.

    Args:
        file_path: Path of the UTF-8 encoded JSONL file.

    Returns:
        The parsed JSON values in file order. Blank or whitespace-only lines
        (for example a trailing newline at the end of the file) are skipped.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            text = raw_line.strip()
            if not text:
                continue
            records.append(json.loads(text))
    return records


def create_embedding_text(question_data: Dict[str, Any]) -> str:
    """Build the text that is embedded and stored for one question.

    The sections (passage, question, options, label, answer, source, id) are
    emitted in that order, each only when the field is present and truthy,
    and joined with a blank line.

    Args:
        question_data: One parsed question record.

    Returns:
        The combined text, or an empty string if no field is populated.
    """
    parts = []

    if question_data.get('passage'):
        parts.append(f"文章：\n{question_data['passage']}")

    if question_data.get('question'):
        parts.append(f"问题：\n{question_data['question']}")

    if question_data.get('options'):
        # str() keeps the join working if an option is not already a string.
        options_text = "\n".join(str(opt) for opt in question_data['options'])
        parts.append(f"选项：\n{options_text}")

    if question_data.get('label'):
        parts.append(f"正确答案：{question_data['label']}")

    if question_data.get('answer'):
        parts.append(f"答案解析：\n{question_data['answer']}")

    other = question_data.get('other')
    if other and isinstance(other, dict):
        if other.get('source'):
            parts.append(f"来源：{other['source']}")
        if other.get('id'):
            parts.append(f"ID：{other['id']}")

    return "\n\n".join(parts)


def import_to_chromadb(
    collection_name: str,
    data_dir: Optional[Union[str, Path]] = None,
    batch_size: int = ADD_BATCH_SIZE,
) -> None:
    """Import the Gaokao question bank into a fresh ChromaDB collection.

    An existing collection with the same name is dropped first, so the
    import always starts from an empty collection.

    Args:
        collection_name: Name of the ChromaDB collection to (re)create.
        data_dir: Directory holding the subject JSONL files. Defaults to the
            directory of this script.
        batch_size: Maximum number of documents per ``collection.add`` call.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    if data_dir is None:
        data_dir = _default_data_dir()

    print("正在初始化ChromaDB...")
    client = build_chrome_client()

    # Best-effort removal of a previous import. Deleting a collection that
    # does not exist raises, which is expected on the first run; the reason
    # is reported instead of being silently swallowed.
    try:
        client.delete_collection(collection_name)
        print(f"已删除旧的collection:{collection_name}")
    except Exception as exc:
        print(f"未删除旧的collection（可能不存在）:{exc}")

    # The embedding function is attached to the collection, so documents
    # added below are embedded without any manual embedding step.
    collection = client.create_collection(
        name=collection_name,
        metadata={"description": "高考题库数据集"},
        embedding_function=_build_embedding_function(),
    )
    print(f"✅ 成功创建集合:{collection_name}")

    total_questions = 0
    all_ids = []
    all_documents = []
    all_metadatas = []

    for subject, filename in SUBJECT_FILES.items():
        file_path = os.path.join(data_dir, filename)

        if not os.path.exists(file_path):
            print(f"警告: 文件不存在，跳过:{file_path}")
            continue

        print(f"正在处理{subject}({filename})...")
        questions = load_jsonl(file_path)
        print(f" 找到{len(questions)}道题目")

        for idx, question in enumerate(questions):
            # IDs are unique across subjects: "<subject>_<position in file>".
            all_ids.append(f"{subject}_{idx}")
            all_documents.append(create_embedding_text(question))
            all_metadatas.append(_build_metadata(subject, idx, question))

        total_questions += len(questions)

    print(f"\n正在批量导入{total_questions}道题目到ChromaDB...")

    if not all_ids:
        print("没有导入任何数据")
        return

    # Documents are embedded with the collection's embedding function.
    # Sending them in slices keeps each request bounded in size.
    for start in range(0, len(all_ids), batch_size):
        end = start + batch_size
        collection.add(
            ids=all_ids[start:end],
            documents=all_documents[start:end],
            metadatas=all_metadatas[start:end],
        )

    print(f"✓ 成功导入{total_questions}道题目！")
    print(f"✓ Collection名称:{collection_name}")


def build_chrome_client() -> "ClientAPI":
    """Create a ChromaDB HTTP client for the local server.

    Returns:
        A client connected to ``CHROMA_HOST:CHROMA_PORT``.
    """
    _require_chromadb()
    client = chromadb.HttpClient(host=CHROMA_HOST, port=CHROMA_PORT)
    # Contact the server before reporting success, so an unreachable server
    # fails here rather than at the first collection call.
    client.heartbeat()
    print("✓ 成功连接到 ChromaDB 服务器")
    return client


def query_example(collection_name: str) -> None:
    """Run a sample similarity query and print the top three matches.

    Args:
        collection_name: Name of the collection to query.
    """
    print("\n" + "=" * 60)
    print("查询示例")
    print("=" * 60)

    client = build_chrome_client()
    # Pass the same embedding function used at import time so the query text
    # is embedded with the same model as the stored documents.
    collection = client.get_collection(
        collection_name,
        embedding_function=_build_embedding_function(),
    )

    # Example: a biology question about autoimmune diseases.
    query_text = "当人体的免疫系统将自身物质当作外来异物进行攻击时, 可引起自身免疫病。下列属于自 身免疫病的是"
    print(f"\n查询: '{query_text}'")
    print("-" * 60)

    results = collection.query(
        query_texts=[query_text],
        n_results=3,
    )

    # Each result field holds one list per query text; only one was sent.
    hits = zip(
        results['documents'][0],
        results['metadatas'][0],
        results['distances'][0],
    )
    for i, (doc, metadata, distance) in enumerate(hits):
        print(f"\n结果{i+1}:")
        print(f" 科目:{metadata.get('subject', 'N/A')}")
        print(f" 距离:{distance:.4f}")
        # Long documents are truncated to their first 200 characters.
        preview = f"{doc[:200]}..." if len(doc) > 200 else doc
        print(f" 内容:\n{preview}")


def main() -> None:
    """Import the dataset next to this script, then run the sample query."""
    _require_chromadb()

    data_dir = _default_data_dir()

    print("=" * 60)
    print("高考题库导入ChromaDB")
    print("=" * 60)
    print(f"数据目录:{data_dir}")
    print()

    collection_name = "gaokao_questions_qwen"

    import_to_chromadb(collection_name=collection_name, data_dir=data_dir)
    query_example(collection_name=collection_name)

    print("\n" + "=" * 60)
    print("导入完成！")
    print("=" * 60)


if __name__ == "__main__":
    main()