结构化输出#
您可以指定 JSON 模式、正则表达式 或 EBNF 来约束模型输出。模型输出将保证遵循给定的约束。每个请求只能指定一个约束参数(json_schema、regex 或 ebnf)。
SGLang 支持三种语法后端:
XGrammar(默认):支持 JSON 模式、正则表达式和 EBNF 约束。
Outlines:支持 JSON 模式和正则表达式约束。
Llguidance:支持 JSON 模式、正则表达式和 EBNF 约束。
我们建议使用 XGrammar,因为它具有更好的性能和实用性。XGrammar 目前使用 GGML BNF 格式。更多详情,请参见 XGrammar 技术概述。
要使用 Outlines,只需在启动服务器时添加 --grammar-backend outlines。 要使用 llguidance,请在启动服务器时添加 --grammar-backend llguidance。 如果未指定后端,将默认使用 XGrammar。
为了获得更好的输出质量,建议在提示中明确包含指令,以指导模型生成所需的格式。 例如,您可以指定"请按以下 JSON 格式生成输出:…"。
兼容 OpenAI 的 API#
[ ]:
# Launch a local SGLang server and create an OpenAI-compatible client for it.
import openai
import os
from sglang.test.doc_patch import launch_server_cmd
from sglang.utils import wait_for_server, print_highlight, terminate_process

# Silence the HuggingFace tokenizers fork-parallelism warning.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

server_process, port = launch_server_cmd(
    "python -m sglang.launch_server --model-path meta-llama/Meta-Llama-3.1-8B-Instruct --host 0.0.0.0 --log-level warning"
)

# Block until the server is ready to accept requests.
wait_for_server(f"http://localhost:{port}")
client = openai.Client(base_url=f"http://127.0.0.1:{port}/v1", api_key="None")
JSON#
您可以直接定义 JSON 模式,或使用 Pydantic 定义和验证响应。
使用 Pydantic
[ ]:
from pydantic import BaseModel, Field


# Define the expected response schema with Pydantic.
class CapitalInfo(BaseModel):
    name: str = Field(..., pattern=r"^\w+$", description="首都城市的名称")
    population: int = Field(..., description="首都城市的人口")


response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    messages=[
        {
            "role": "user",
            "content": "请以 JSON 格式生成法国首都的信息。",
        },
    ],
    temperature=0,
    max_tokens=128,
    response_format={
        "type": "json_schema",
        "json_schema": {
            "name": "foo",
            # Convert the Pydantic model into a JSON schema.
            "schema": CapitalInfo.model_json_schema(),
        },
    },
)

response_content = response.choices[0].message.content
# Validate the JSON response against the Pydantic model.
capital_info = CapitalInfo.model_validate_json(response_content)
print_highlight(f"已验证的响应: {capital_info.model_dump_json()}")
直接使用 JSON 模式
[ ]:
import json

# Define the JSON schema directly as a JSON string.
json_schema = json.dumps(
    {
        "type": "object",
        "properties": {
            "name": {"type": "string", "pattern": "^[\\w]+$"},
            "population": {"type": "integer"},
        },
        "required": ["name", "population"],
    }
)

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    messages=[
        {
            "role": "user",
            "content": "请以 JSON 格式提供法国首都的信息。",
        },
    ],
    temperature=0,
    max_tokens=128,
    response_format={
        "type": "json_schema",
        # The API expects a parsed schema object, so load the string back.
        "json_schema": {"name": "foo", "schema": json.loads(json_schema)},
    },
)

print_highlight(response.choices[0].message.content)
EBNF#
[ ]:
# EBNF grammar: the output is either a bare city name or a sentence of the
# form "<city> is the capital of <country>".
ebnf_grammar = """
root ::= city | description
city ::= "London" | "Paris" | "Berlin" | "Rome"
description ::= city " is " status
status ::= "the capital of " country
country ::= "England" | "France" | "Germany" | "Italy"
"""

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    messages=[
        {"role": "system", "content": "你是一个有用的地理机器人。"},
        {
            "role": "user",
            "content": "请提供法国首都的信息。",
        },
    ],
    temperature=0,
    max_tokens=32,
    # EBNF constraints are not part of the OpenAI spec, so they are passed
    # through extra_body.
    extra_body={"ebnf": ebnf_grammar},
)

print_highlight(response.choices[0].message.content)
正则表达式#
[ ]:
# Constrain the completion with a regular expression (via extra_body, since
# regex constraints are an SGLang extension of the OpenAI API).
response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    messages=[
        {"role": "user", "content": "法国的首都是什么?"},
    ],
    temperature=0,
    max_tokens=128,
    extra_body={"regex": "(Paris|London)"},
)

print_highlight(response.choices[0].message.content)
结构化标签#
[ ]:
# Tool definition for a current-weather lookup, in OpenAI function-call style.
# The parameter descriptions are user-facing strings consumed by the model.
tool_get_current_weather = {
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "获取给定位置的当前天气",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {
                    "type": "string",
                    "description": "要查找天气的城市,例如 'San Francisco'",
                },
                "state": {
                    "type": "string",
                    # Bug fix: the description had been split into two entries,
                    # leaving a bare string inside the dict literal (a
                    # SyntaxError). Merged back into a single description value.
                    "description": "城市所在州的两字母缩写,例如 'CA' 代表 'California'",
                },
                "unit": {
                    "type": "string",
                    "description": "获取温度的单位",
                    "enum": ["celsius", "fahrenheit"],
                },
            },
            "required": ["city", "state", "unit"],
        },
    },
}
# Tool definition for fetching the current date/time in a given timezone.
_get_current_date_params = {
    "type": "object",
    "properties": {
        "timezone": {
            "type": "string",
            "description": "获取当前日期和时间的时区,例如 'America/New_York'",
        }
    },
    "required": ["timezone"],
}

tool_get_current_date = {
    "type": "function",
    "function": {
        "name": "get_current_date",
        "description": "获取给定时区的当前日期和时间",
        "parameters": _get_current_date_params,
    },
}
# Extract just the parameter schemas — these are what the structural-tag
# constrained decoder needs.
schema_get_current_weather = tool_get_current_weather["function"]["parameters"]
schema_get_current_date = tool_get_current_date["function"]["parameters"]
def get_messages():
    """Build the chat history for the tool-calling examples.

    Returns a system message that documents the two available tools and the
    expected ``<function=NAME>{...}</function>`` reply format, followed by a
    user query asking for the date/time and weather in New York.
    """
    return [
        {
            "role": "system",
            "content": f"""
# 工具指令
- 总是在您分享的消息中执行 python 代码。
- 当查找实时信息时,使用相关函数(如果可用),否则回退到 brave_search
您可以访问以下函数:
使用函数 'get_current_weather' 来:获取给定位置的当前天气
{tool_get_current_weather["function"]}
使用函数 'get_current_date' 来:获取给定时区的当前日期和时间
{tool_get_current_date["function"]}
如果您选择调用函数,仅以以下格式回复:
<{{start_tag}}={{function_name}}>{{parameters}}{{end_tag}}
其中
start_tag => `<function`
parameters => 以函数参数名为键,函数参数值为值的 JSON 字典。
end_tag => `</function>`
这是一个示例,
<function=example_function_name>{{"example_name": "example_value"}}</function>
提醒:
- 函数调用必须遵循指定的格式
- 必须指定必需参数
- 一次只能调用一个函数
- 将整个函数调用回复放在一行上
- 使用搜索结果回答用户查询时,始终注明您的来源
您是一个有用的助手。""",
        },
        {
            "role": "user",
            "content": "您在纽约。请获取当前的日期和时间以及天气。",
        },
    ]
messages = get_messages()

response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    messages=messages,
    response_format={
        # Structural-tag constraint: each structure is a begin tag, a JSON
        # schema for the tag body, and an end tag. "triggers" tells the
        # decoder which prefix starts constrained generation.
        "type": "structural_tag",
        "structures": [
            {
                "begin": "<function=get_current_weather>",
                "schema": schema_get_current_weather,
                "end": "</function>",
            },
            {
                "begin": "<function=get_current_date>",
                "schema": schema_get_current_date,
                "end": "</function>",
            },
        ],
        "triggers": ["<function="],
    },
)

print_highlight(response.choices[0].message.content)
[ ]:
# The latest XGrammar structural-tag format is also supported:
# https://xgrammar.mlc.ai/docs/tutorials/structural_tag.html
response = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3.1-8B-Instruct",
    messages=messages,
    response_format={
        "type": "structural_tag",
        "format": {
            "type": "triggered_tags",
            "triggers": ["<function="],
            "tags": [
                {
                    "begin": "<function=get_current_weather>",
                    "content": {
                        "type": "json_schema",
                        "json_schema": schema_get_current_weather,
                    },
                    "end": "</function>",
                },
                {
                    "begin": "<function=get_current_date>",
                    "content": {
                        "type": "json_schema",
                        "json_schema": schema_get_current_date,
                    },
                    "end": "</function>",
                },
            ],
            # Tags are optional and more than one may appear.
            "at_least_one": False,
            "stop_after_first": False,
        },
    },
)

print_highlight(response.choices[0].message.content)
原生 API 和 SGLang 运行时 (SRT)#
JSON#
使用 Pydantic
[ ]:
import requests
import json
from pydantic import BaseModel, Field
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")


# Define the expected response schema with Pydantic.
class CapitalInfo(BaseModel):
    name: str = Field(..., pattern=r"^\w+$", description="首都城市的名称")
    population: int = Field(..., description="首都城市的人口")


# Build the prompt and send the request to the native /generate endpoint.
messages = [
    {
        "role": "user",
        "content": "这是法国首都的信息,以 JSON 格式呈现。\n",
    }
]
text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, return_dict=False
)
response = requests.post(
    f"http://localhost:{port}/generate",
    json={
        "text": text,
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 64,
            # The native API takes the schema as a JSON string.
            "json_schema": json.dumps(CapitalInfo.model_json_schema()),
        },
    },
)
print_highlight(response.json())

response_data = json.loads(response.json()["text"])
# Validate the generated text against the Pydantic model.
capital_info = CapitalInfo.model_validate(response_data)
print_highlight(f"已验证的响应: {capital_info.model_dump_json()}")
直接使用 JSON 模式
[ ]:
# Define the JSON schema directly as a JSON string.
json_schema = json.dumps(
    {
        "type": "object",
        "properties": {
            "name": {"type": "string", "pattern": "^[\\w]+$"},
            "population": {"type": "integer"},
        },
        "required": ["name", "population"],
    }
)

# JSON-constrained generation via the native /generate endpoint.
response = requests.post(
    f"http://localhost:{port}/generate",
    json={
        "text": text,
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 64,
            "json_schema": json_schema,
        },
    },
)
print_highlight(response.json())
EBNF#
[ ]:
messages = [
    {
        "role": "user",
        "content": "请提供法国首都的信息。",
    }
]
text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, return_dict=False
)
response = requests.post(
    f"http://localhost:{port}/generate",
    json={
        "text": text,
        "sampling_params": {
            "max_new_tokens": 128,
            "temperature": 0,
            "n": 3,  # generate three completions for the same prompt
            # EBNF grammar: output is either a city name or a sentence
            # "<city> is the capital of <country>".
            "ebnf": (
                "root ::= city | description\n"
                'city ::= "London" | "Paris" | "Berlin" | "Rome"\n'
                'description ::= city " is " status\n'
                'status ::= "the capital of " country\n'
                'country ::= "England" | "France" | "Germany" | "Italy"'
            ),
        },
        "stream": False,
        "return_logprob": False,
    },
)
print_highlight(response.json())
正则表达式#
[ ]:
messages = [
    {
        "role": "user",
        # NOTE(review): this prompt text looks garbled ("巴黎是的首都") — it
        # appears to be a broken translation of "Paris is the capital of";
        # confirm the intended prompt before changing it.
        "content": "巴黎是的首都",
    }
]
text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, return_dict=False
)
# Regex-constrained generation via the native /generate endpoint.
response = requests.post(
    f"http://localhost:{port}/generate",
    json={
        "text": text,
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 64,
            "regex": "(France|England)",
        },
    },
)
print_highlight(response.json())
结构化标签#
[ ]:
from transformers import AutoTokenizer

# Build the prompt text from the tool-calling messages defined earlier.
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct")
text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, return_dict=False
)
payload = {
    "text": text,
    "sampling_params": {
        # The native API takes the structural-tag spec as a JSON string.
        "structural_tag": json.dumps(
            {
                "type": "structural_tag",
                "structures": [
                    {
                        "begin": "<function=get_current_weather>",
                        "schema": schema_get_current_weather,
                        "end": "</function>",
                    },
                    {
                        "begin": "<function=get_current_date>",
                        "schema": schema_get_current_date,
                        "end": "</function>",
                    },
                ],
                "triggers": ["<function="],
            }
        )
    },
}

# Send a POST request to the API endpoint.
response = requests.post(f"http://localhost:{port}/generate", json=payload)
print_highlight(response.json())
[ ]:
# The latest XGrammar structural-tag format is also supported:
# https://xgrammar.mlc.ai/docs/tutorials/structural_tag.html
payload = {
    "text": text,
    "sampling_params": {
        "structural_tag": json.dumps(
            {
                "type": "structural_tag",
                "format": {
                    "type": "triggered_tags",
                    "triggers": ["<function="],
                    "tags": [
                        {
                            "begin": "<function=get_current_weather>",
                            "content": {
                                "type": "json_schema",
                                "json_schema": schema_get_current_weather,
                            },
                            "end": "</function>",
                        },
                        {
                            "begin": "<function=get_current_date>",
                            "content": {
                                "type": "json_schema",
                                "json_schema": schema_get_current_date,
                            },
                            "end": "</function>",
                        },
                    ],
                    # Tags are optional and more than one may appear.
                    "at_least_one": False,
                    "stop_after_first": False,
                },
            }
        )
    },
}

# Send a POST request to the API endpoint.
response = requests.post(f"http://localhost:{port}/generate", json=payload)
print_highlight(response.json())
[ ]:
# Shut down the local server before moving on to the offline engine examples.
terminate_process(server_process)
离线引擎 API#
[ ]:
import sglang as sgl

# Create an in-process (offline) engine with the XGrammar grammar backend.
llm = sgl.Engine(
    model_path="meta-llama/Meta-Llama-3.1-8B-Instruct", grammar_backend="xgrammar"
)
JSON#
使用 Pydantic
[ ]:
import json
from pydantic import BaseModel, Field

prompts = [
    "请以 JSON 格式提供中国首都的信息。",
    "请以 JSON 格式提供法国首都的信息。",
    "请以 JSON 格式提供爱尔兰首都的信息。",
]


# Define the expected response schema with Pydantic.
class CapitalInfo(BaseModel):
    name: str = Field(..., pattern=r"^\w+$", description="首都城市的名称")
    population: int = Field(..., description="首都城市的人口")


sampling_params = {
    "temperature": 0.1,
    "top_p": 0.95,
    "json_schema": json.dumps(CapitalInfo.model_json_schema()),
}

outputs = llm.generate(prompts, sampling_params)
for prompt, output in zip(prompts, outputs):
    print_highlight("===============================")
    print_highlight(f"提示: {prompt}")
    # Validate each generated text against the Pydantic model.
    capital_info = CapitalInfo.model_validate_json(output["text"])
    print_highlight(f"已验证的输出: {capital_info.model_dump_json()}")
直接使用 JSON 模式
[ ]:
prompts = [
    "请以 JSON 格式提供中国首都的信息。",
    "请以 JSON 格式提供法国首都的信息。",
    "请以 JSON 格式提供爱尔兰首都的信息。",
]

# Define the JSON schema directly as a JSON string.
json_schema = json.dumps(
    {
        "type": "object",
        "properties": {
            "name": {"type": "string", "pattern": "^[\\w]+$"},
            "population": {"type": "integer"},
        },
        "required": ["name", "population"],
    }
)

sampling_params = {"temperature": 0.1, "top_p": 0.95, "json_schema": json_schema}

outputs = llm.generate(prompts, sampling_params)
for prompt, output in zip(prompts, outputs):
    print_highlight("===============================")
    print_highlight(f"提示: {prompt}\n生成的文本: {output['text']}")
EBNF#
[ ]:
prompts = [
    "请提供法国首都的信息。",
    "请提供德国首都的信息。",
    "请提供意大利首都的信息。",
]

sampling_params = {
    "temperature": 0.8,
    "top_p": 0.95,
    # EBNF grammar: output is either a city name or a sentence
    # "<city> is the capital of <country>".
    "ebnf": (
        "root ::= city | description\n"
        'city ::= "London" | "Paris" | "Berlin" | "Rome"\n'
        'description ::= city " is " status\n'
        'status ::= "the capital of " country\n'
        'country ::= "England" | "France" | "Germany" | "Italy"'
    ),
}

outputs = llm.generate(prompts, sampling_params)
for prompt, output in zip(prompts, outputs):
    print_highlight("===============================")
    print_highlight(f"提示: {prompt}\n生成的文本: {output['text']}")
正则表达式#
[ ]:
prompts = [
    "请提供作为全球主要城市的伦敦的信息:",
    "请提供作为全球主要城市的巴黎的信息:",
]

# Constrain each completion to match the regular expression.
sampling_params = {"temperature": 0.8, "top_p": 0.95, "regex": "(France|England)"}

outputs = llm.generate(prompts, sampling_params)
for prompt, output in zip(prompts, outputs):
    print_highlight("===============================")
    print_highlight(f"提示: {prompt}\n生成的文本: {output['text']}")
结构化标签#
[ ]:
# Reuse the tool-calling chat messages from earlier and render the prompt.
text = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True, return_dict=False
)
prompts = [text]

sampling_params = {
    "temperature": 0.8,
    "top_p": 0.95,
    # The engine takes the structural-tag spec as a JSON string.
    "structural_tag": json.dumps(
        {
            "type": "structural_tag",
            "structures": [
                {
                    "begin": "<function=get_current_weather>",
                    "schema": schema_get_current_weather,
                    "end": "</function>",
                },
                {
                    "begin": "<function=get_current_date>",
                    "schema": schema_get_current_date,
                    "end": "</function>",
                },
            ],
            "triggers": ["<function="],
        }
    ),
}

# Generate with the offline engine (no HTTP request involved).
outputs = llm.generate(prompts, sampling_params)
for prompt, output in zip(prompts, outputs):
    print_highlight("===============================")
    print_highlight(f"提示: {prompt}\n生成的文本: {output['text']}")
[ ]:
# The latest XGrammar structural-tag format is also supported:
# https://xgrammar.mlc.ai/docs/tutorials/structural_tag.html
sampling_params = {
    "temperature": 0.8,
    "top_p": 0.95,
    "structural_tag": json.dumps(
        {
            "type": "structural_tag",
            "format": {
                "type": "triggered_tags",
                "triggers": ["<function="],
                "tags": [
                    {
                        "begin": "<function=get_current_weather>",
                        "content": {
                            "type": "json_schema",
                            "json_schema": schema_get_current_weather,
                        },
                        "end": "</function>",
                    },
                    {
                        "begin": "<function=get_current_date>",
                        "content": {
                            "type": "json_schema",
                            "json_schema": schema_get_current_date,
                        },
                        "end": "</function>",
                    },
                ],
                # Tags are optional and more than one may appear.
                "at_least_one": False,
                "stop_after_first": False,
            },
        }
    ),
}

# Generate with the offline engine (no HTTP request involved).
outputs = llm.generate(prompts, sampling_params)
for prompt, output in zip(prompts, outputs):
    print_highlight("===============================")
    print_highlight(f"提示: {prompt}\n生成的文本: {output['text']}")
[ ]:
# Release the offline engine's resources.
llm.shutdown()