import requests
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
def get_wikipedia_page(title: str):
    """Retrieve the full plain-text content of a Wikipedia page.

    :param title: Title of the Wikipedia page.
    :return: The page's full text content (raw string), or None if the
        page has no extract (e.g. the page does not exist).
    :raises requests.HTTPError: If the API responds with an error status.
    """
    # Wikipedia API endpoint
    URL = "https://en.wikipedia.org/w/api.php"
    # Parameters for the API request
    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "extracts",
        "explaintext": True,
    }
    # Custom User-Agent header to comply with Wikipedia's best practices
    headers = {"User-Agent": "RAGatouille_tutorial/0.0.1 (ben@clavie.eu)"}
    # timeout prevents the call from hanging indefinitely on a stalled connection
    response = requests.get(URL, params=params, headers=headers, timeout=30)
    # Surface HTTP errors explicitly instead of failing later on malformed JSON
    response.raise_for_status()
    data = response.json()
    # The API keys pages by numeric page id; take the single returned page
    page = next(iter(data["query"]["pages"].values()))
    # Missing pages have no "extract" key -> return None
    return page.get("extract")
# Fetch the article text and split it into ~500-character chunks for indexing.
text = get_wikipedia_page("Hayao_Miyazaki")
if text is None:
    # get_wikipedia_page returns None when the page has no extract; fail fast
    # with a clear message instead of crashing inside the text splitter.
    raise ValueError("Wikipedia page 'Hayao_Miyazaki' returned no extract.")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
texts = text_splitter.create_documents([text])