LangChain-AI应用开发框架(十一)
一.嵌入与嵌入模型(Embedding and Embedding Models)
1.什么是向量?
2.嵌入模型应用场景
二.Embeddings嵌入模型类
更多的链接:https://docs.langchain.com/oss/python/integrations/providers/overview
1.定义嵌入模型
# Install the dependencies first (this is a shell command, not Python):
#   pip install --upgrade langchain langchain-community zhipuai

# Drop-in replacement: use Zhipu AI's Embeddings class instead of OpenAI's.
from langchain_community.embeddings import ZhipuAIEmbeddings

# Define the embedding model (mirrors the OpenAI-style constructor call).
embeddings = ZhipuAIEmbeddings(
    model="embedding-3",  # counterpart of OpenAI's text-embedding-3-large
    dimensions=1024,  # custom output size; the article lists 256/512/1024/2048
)

# 2.嵌入文档列表
import getpass
import os

# 1. Configure the Zhipu AI API key: prompt for it only when the environment
#    variable is missing or empty, so a pre-set key is never overwritten.
if not os.getenv("ZHIPUAI_API_KEY"):
    os.environ["ZHIPUAI_API_KEY"] = getpass.getpass("请输入智谱AI API Key: ")

# 2. Loader and splitter imports (same as in the OpenAI version).
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_text_splitters import CharacterTextSplitter

# 3. The key substitution: import Zhipu AI's Embeddings class.
from langchain_community.embeddings import ZhipuAIEmbeddings

# Load the Markdown file.
markdown_path = "../Docs/Markdown/脚手架级微服务租房平台Q&A.md"
loader = UnstructuredMarkdownLoader(markdown_path)
data = loader.load()

# Split the loaded document into overlapping chunks, with chunk length
# measured by the cl100k_base tiktoken encoding.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base",
    chunk_size=200,
    chunk_overlap=50,
)
documents = text_splitter.split_documents(data)

# 4. Define the embedding model (mirrors the OpenAI-style constructor call).
embeddings = ZhipuAIEmbeddings(
    model="embedding-3",  # counterpart of OpenAI's text-embedding-3-large
    dimensions=1024,  # custom output size; the article gives the range 256~2048
)

# 5. Embed the chunk texts; one vector is returned per input text.
texts = [doc.page_content for doc in documents]
documents_vector = embeddings.embed_documents(texts)

# 6. Print the results.
print(f"文档数量为: {len(documents)}, 生成了{len(documents_vector)}个向量的列表")
# zip() stops at the shorter input, so a file that splits into fewer than two
# chunks no longer raises IndexError; with two or more the output is unchanged.
for ordinal, vector in zip(("第一个", "第二个"), documents_vector):
    print(f"{ordinal}文档向量维度: {len(vector)}")

# 3.嵌入单个查询
# Drop-in replacement: use Zhipu AI's Embeddings class instead of OpenAI's.
from langchain_community.embeddings import ZhipuAIEmbeddings

# Define the embedding model (mirrors the OpenAI-style constructor call).
embeddings = ZhipuAIEmbeddings(
    model="embedding-3",  # counterpart of OpenAI's text-embedding-3-large
    dimensions=1024,  # custom output size; the article lists 256/512/1024/2048
)

# Convert a single query string into its vector representation.
query_vector = embeddings.embed_query("你好")

print(f"embedding-3 向量维度: {len(query_vector)}")
print(f"向量前5个数值: {query_vector[:5]}")

# 三.向量存储(Vector Stores)
1.向量数据库介绍
链接:https://python.langchain.com/docs/integrations/vectorstores/
2.内存存储
# --- Snippet 1: create an in-memory vector store (OpenAI embeddings) ---------
from langchain_openai import OpenAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore

# Define the embedding model.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Initialise the in-memory store; it uses `embeddings` to vectorise content.
vector_store = InMemoryVectorStore(embedding=embeddings)


# --- Snippet 2: load, split and add documents to the store -------------------
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_text_splitters import CharacterTextSplitter

# Build the splitter.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base",
    chunk_size=200,
    chunk_overlap=50,
)

# Load the document. The file name is spelled exactly as in the other
# snippets; the extracted text had a look-alike radical character and a stray
# space in it, which would point at a different (non-existent) file.
data = UnstructuredMarkdownLoader(
    "../Docs/Markdown/脚手架级微服务租房平台Q&A.md"
).load()

# Split the document into chunks.
documents = text_splitter.split_documents(data)

# Add the chunks; the store returns one id per stored document.
ids = vector_store.add_documents(documents=documents)

print(f"共编排了{len(ids)}个⽂档索引")
print(f"前3个⽂档的索引是:{ids[:3]}")


# --- Snippet 3: the same flow end to end with Zhipu AI embeddings ------------
from langchain_community.document_loaders import UnstructuredMarkdownLoader
from langchain_community.embeddings import ZhipuAIEmbeddings
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_text_splitters import CharacterTextSplitter

embeddings = ZhipuAIEmbeddings(
    model="embedding-3",  # counterpart of OpenAI's text-embedding-3-large
)

vector_store = InMemoryVectorStore(embedding=embeddings)

# Load the Markdown file.
markdown_path = "../Docs/Markdown/脚手架级微服务租房平台Q&A.md"
loader = UnstructuredMarkdownLoader(markdown_path)
data = loader.load()

# Split the document into chunks.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    encoding_name="cl100k_base",
    chunk_size=400,
    chunk_overlap=50,
)

# List of chunk documents.
documents = text_splitter.split_documents(data)

# Store the documents in the in-memory vector store.
ids = vector_store.add_documents(documents)

print(f"共有{len(documents)}个文档,编排了{len(ids)}个索引")
print(f"前三个文档的索引:{ids[:3]}")

# Optional demo, left disabled as in the article.
# # Fetch documents by id
# doc_2 = vector_store.get_by_ids(ids[:2])
# print(doc_2)
#
# # Delete documents
# vector_store.delete(ids=ids[:2])
# doc_2 = vector_store.get_by_ids(ids[:3])
# print(doc_2)

# Similarity search: retrieve the two chunks closest to the query.
search_docs = vector_store.similarity_search(query="项目介绍", k=2)
for doc in search_docs:
    print("*" * 30)
    print(doc.page_content)


# --- Snippet 4: similarity search restricted by a metadata filter ------------
from langchain_core.documents import Document


def _filter_function(doc: Document) -> bool:
    """Return True only for documents whose ``source`` metadata is "hahaha".

    Documents without a ``source`` key are rejected, because ``dict.get``
    returns ``None`` for them.
    """
    # NOTE(review): "hahaha" looks like a deliberate non-matching value used to
    # show the filter excluding everything; see the closing remark below.
    return doc.metadata.get("source") == "hahaha"


search_docs = vector_store.similarity_search(
    query="数据库表怎么设计的?",
    k=2,
    filter=_filter_function,
)
for doc in search_docs:
    print("*" * 30)
    print(doc.page_content)

# 我们把source换成../Docs/markdown/....就又可以进行检索了
