import openai
import pytest
from langsmith import testing as t
from langsmith import wrappers

# OpenAI client wrapped by LangSmith; used below as the LLM grader.
oai_client = wrappers.wrap_openai(openai.OpenAI())


@pytest.mark.langsmith
def test_offtopic_input() -> None:
    """Check that an off-topic query yields the "not a valid query" reply.

    Logs the input, the generated output and the reference output, then asks
    an LLM grader whether the actual and expected answers are semantically
    equivalent. The grader's 0/1 verdict is logged as the ``correct``
    feedback score and asserted to be truthy.
    """
    user_query = "whats up"
    t.log_inputs({"user_query": user_query})

    sql = generate_sql(user_query)
    t.log_outputs({"sql": sql})

    expected = "Sorry that is not a valid query."
    t.log_reference_outputs({"sql": expected})

    # Use this context manager to trace any steps used for generating
    # evaluation feedback separately from the main application logic.
    with t.trace_feedback():
        instructions = (
            "Return 1 if the ACTUAL and EXPECTED answers are semantically equivalent, "
            "otherwise return 0. Return only 0 or 1 and nothing else."
        )
        grade = oai_client.chat.completions.create(
            model="gpt-5.4-mini",
            messages=[
                {"role": "system", "content": instructions},
                {"role": "user", "content": f"ACTUAL: {sql}\nEXPECTED: {expected}"},
            ],
        )
        # The grader is instructed to reply with only "0" or "1".
        score = float(grade.choices[0].message.content)
        t.log_feedback(key="correct", score=score)

    assert score
@pytest.mark.langsmith(output_keys=["expected_sql"])
@pytest.mark.parametrize(
    "user_query, expected_sql",
    [
        ("Get all users from the customers table", "SELECT * FROM customers"),
        ("Get all users from the orders table", "SELECT * FROM orders"),
    ],
)
def test_sql_generation_parametrized(user_query, expected_sql):
    """Assert that the SQL generated for each query equals the expected SQL.

    ``expected_sql`` is listed in the marker's ``output_keys``; the remaining
    parameter (``user_query``) is the query passed to ``generate_sql``.
    """
    sql = generate_sql(user_query)
    assert sql == expected_sql
from langsmith import expect


@pytest.mark.langsmith
def test_sql_generation_select_all():
    """Check that the SQL generated for a customers query mentions the table."""
    user_query = "Get all users from the customers table"
    sql = generate_sql(user_query)
    expect(sql).to_contain("customers")
@pytest.mark.langsmith(output_keys=["expectation"])
@pytest.mark.parametrize(
    "query, expectation",
    [
        ("what's the capital of France?", "Paris"),
    ],
)
def test_embedding_similarity(query, expectation):
    """Score the chatbot's answer against the expectation with two metrics.

    The embedding distance is logged and additionally asserted to be below
    0.5; the edit distance is logged only, with no assertion attached.
    """
    prediction = my_chatbot(query)

    expect.embedding_distance(
        # This step logs the distance as feedback for this run
        prediction=prediction,
        expectation=expectation,
        # Adding a matcher (in this case, 'to_be_*'), logs 'expectation' feedback
    ).to_be_less_than(0.5)  # Optional predicate to assert against

    expect.edit_distance(
        # This computes the normalized Damerau-Levenshtein distance between
        # the two strings
        prediction=prediction,
        expectation=expectation,
        # If no predicate is provided below, 'assert' isn't called, but the
        # score is still logged
    )
此测试用例将被分配 4 个分数:

1. 预测和期望之间的 `embedding_distance`
2. 二元 `expectation` 分数(如果余弦距离小于 0.5 则为 1,否则为 0)
3. 预测和期望之间的 `edit_distance`
4. 整体测试通过/失败分数(二元)
`expect` 工具以 Jest 的 `expect` API 为蓝本设计,并内置了一些现成功能,使对 LLM 输出的评分更容易。