- 带元数据的向量存储
- 向量相似度搜索和最大边际相关性搜索,支持元数据过滤选项
- 支持点积、余弦和欧氏距离度量
- 通过索引创建和近似最近邻搜索进行性能优化(即将推出)
设置
使用 LangChain 与 Db2 向量存储和搜索的前提条件
安装 langchain-db2 包,它是用于 Db2 向量存储和搜索的 LangChain 集成包。
安装该包时会同时安装其依赖项,如 langchain-core 和 ibm_db。
Copy
# pip install -U langchain-db2
连接到 Db2 向量存储
以下示例代码展示如何连接到 Db2 数据库。除上述依赖项外,您还需要一个正在运行的 Db2 数据库实例(需 v12.1.2 或更高版本,才支持向量数据类型)。
import ibm_db
import ibm_db_dbi
# Connection settings -- fill these in for your own Db2 instance.
database = ""
username = ""
password = ""

try:
    connection = ibm_db_dbi.connect(database, username, password)
except Exception as e:
    # Report the driver's error so a failed connection can be diagnosed;
    # note that `connection` stays undefined when this branch runs.
    print(f"Connection failed! {e}")
else:
    print("Connection successful!")
导入所需依赖
Copy
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores.utils import DistanceStrategy
from langchain_core.documents import Document
from langchain_db2 import db2vs
from langchain_db2.db2vs import DB2VS
初始化
创建文档
Copy
# Define the list of sample documents.
# Each entry is a dict with three string fields:
#   "id"   - identifier of the document
#   "text" - the passage content
#   "link" - URL of the page the passage refers to
documents_json_list = [
{
"id": "doc_1_2_P4",
"text": "Db2 handles LOB data differently than other kinds of data. As a result, you sometimes need to take additional actions when you define LOB columns and insert the LOB data.",
"link": "https://www.ibm.com/docs/en/db2-for-zos/12?topic=programs-storing-lob-data-in-tables",
},
{
"id": "doc_11.1.0_P1",
"text": "Db2® column-organized tables add columnar capabilities to Db2 databases, which include data that is stored with column organization and vector processing of column data. Using this table format with star schema data marts provides significant improvements to storage, query performance, and ease of use through simplified design and tuning.",
"link": "https://www.ibm.com/docs/en/db2/11.1.0?topic=organization-column-organized-tables",
},
{
"id": "id_22.3.4.3.1_P2",
"text": "Data structures are elements that are required to use Db2®. You can access and use these elements to organize your data. Examples of data structures include tables, table spaces, indexes, index spaces, keys, views, and databases.",
"link": "https://www.ibm.com/docs/en/zos-basic-skills?topic=concepts-db2-data-structures",
},
{
"id": "id_3.4.3.1_P3",
"text": "Db2® maintains a set of tables that contain information about the data that Db2 controls. These tables are collectively known as the catalog. The catalog tables contain information about Db2 objects such as tables, views, and indexes. When you create, alter, or drop an object, Db2 inserts, updates, or deletes rows of the catalog that describe the object.",
"link": "https://www.ibm.com/docs/en/zos-basic-skills?topic=objects-db2-catalog",
},
]
Copy
# Wrap every raw record in a LangChain Document: the passage text becomes the
# page content, while 'id' and 'link' are carried along as metadata.
documents_langchain = [
    Document(
        page_content=record["text"],
        metadata={"id": record["id"], "link": record["link"]},
    )
    for record in documents_json_list
]
使用不同距离度量创建向量存储
首先,我们将使用三种不同的距离策略分别创建一个向量存储。(之后如果手动连接到 Db2 数据库,您将看到三张表:Documents_DOT、Documents_COSINE 和 Documents_EUCLIDEAN。)
# Create one Db2 vector store per distance strategy.
# When calling this API, it is recommended to first initialize the vector
# store with a subset of the documents via from_documents(), and then add
# further documents incrementally with add_texts(). This helps avoid
# overloading the system and keeps document processing efficient.
model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")


def _build_vector_store(table_name, distance_strategy):
    """Create a DB2VS store from the sample documents in the given table."""
    return DB2VS.from_documents(
        documents_langchain,
        model,
        client=connection,
        table_name=table_name,
        distance_strategy=distance_strategy,
    )


vector_store_dot = _build_vector_store("Documents_DOT", DistanceStrategy.DOT_PRODUCT)
# NOTE: despite its name, vector_store_max is the store using the cosine strategy.
vector_store_max = _build_vector_store("Documents_COSINE", DistanceStrategy.COSINE)
vector_store_euclidean = _build_vector_store(
    "Documents_EUCLIDEAN", DistanceStrategy.EUCLIDEAN_DISTANCE
)
管理向量存储
演示文本的添加和删除操作,以及基本的相似度搜索
Copy
def manage_texts(vector_stores):
    """Add, delete, and search sample texts in each vector store.

    For every store this function:

    1. adds two sample texts with their metadata; an error raised by the add
       (for example when the same texts were already inserted) is reported
       on stdout, including the error text, instead of being propagated;
    2. deletes the sample texts again using their metadata ``id`` values;
    3. runs a basic similarity search and prints the results.

    Args:
        vector_stores (list): DB2VS instances to exercise.
    """
    texts = ["Rohan", "Shailendra"]
    metadata = [
        {"id": "100", "link": "Document Example Test 1"},
        {"id": "101", "link": "Document Example Test 2"},
    ]

    for i, vs in enumerate(vector_stores, start=1):
        # Add the sample texts; keep going on failure so the remaining steps
        # of the demonstration still run, but show what actually went wrong.
        try:
            vs.add_texts(texts, metadata)
        except Exception as ex:
            print(
                f"\n\n\nExpected error on duplicate add for vector store {i}: {ex}\n\n\n"
            )
        else:
            print(f"\n\n\nAdd texts complete for vector store {i}\n\n\n")

        # Delete the sample texts using their 'id' values.
        vs.delete([entry["id"] for entry in metadata])
        print(f"\n\n\nDelete texts complete for vector store {i}\n\n\n")

        # Basic similarity search, top 2 results.
        results = vs.similarity_search("How are LOBS stored in Db2 Database", 2)
        print(f"\n\n\nSimilarity search results for vector store {i}: {results}\n\n\n")
# Collect the three stores and run the add/delete/search demonstration on each.
vector_store_list = [vector_store_dot, vector_store_max, vector_store_euclidean]
manage_texts(vector_store_list)
查询向量存储
演示向量存储上的高级搜索,包括带属性过滤和不带属性过滤的搜索
使用过滤器时,我们仅选择元数据 id 为 101 的文档。
# Run the advanced search demonstrations.
def conduct_advanced_searches(
    vector_stores,
    *,
    query="How are LOBS stored in Db2 Database",
    filter_criteria=None,
    k=2,
):
    """Run and print a series of searches against each vector store.

    For every store, six searches are executed and their results printed:
    similarity search, similarity search with score, and max marginal
    relevance search (``fetch_k=20``, ``lambda_mult=0.5``), each once without
    and once with a metadata filter.

    Args:
        vector_stores (list): Vector store instances to query.
        query (str): Query text used for every search.
        filter_criteria (dict | None): Metadata filter passed as ``filter=``
            to the filtered searches. Defaults to ``{"id": ["101"]}``, which
            is intended to match documents whose metadata 'id' is '101'.
        k (int): Number of results requested from every search.
    """
    if filter_criteria is None:
        # Direct comparison filter against the document metadata.
        filter_criteria = {"id": ["101"]}

    for i, vs in enumerate(vector_stores, start=1):
        print(f"\n--- Vector Store {i} Advanced Searches ---")

        # Similarity search without a filter
        print("\nSimilarity search results without filter:")
        print(vs.similarity_search(query, k))

        # Similarity search with a filter
        print("\nSimilarity search results with filter:")
        print(vs.similarity_search(query, k, filter=filter_criteria))

        # Similarity search with relevance score
        print("\nSimilarity search with relevance score:")
        print(vs.similarity_search_with_score(query, k))

        # Similarity search with relevance score and a filter
        print("\nSimilarity search with relevance score with filter:")
        print(vs.similarity_search_with_score(query, k, filter=filter_criteria))

        # Max marginal relevance search
        print("\nMax marginal relevance search results:")
        print(vs.max_marginal_relevance_search(query, k, fetch_k=20, lambda_mult=0.5))

        # Max marginal relevance search with a filter
        print("\nMax marginal relevance search results with filter:")
        print(
            vs.max_marginal_relevance_search(
                query, k, fetch_k=20, lambda_mult=0.5, filter=filter_criteria
            )
        )
# Run the advanced search demonstrations against every store in the list.
conduct_advanced_searches(vector_store_list)
通过 MCP 将这些文档连接到 Claude、VSCode 等,获取实时解答。

