Sending Requests#
This notebook provides a quick-start guide on using SGLang for chat completions after installation.
For vision language models, see OpenAI APIs - Vision.
For embedding models, see OpenAI APIs - Embedding and Encode (embedding model).
For reward models, see Classify (reward model).
Launch a Server#
[ ]:
from sglang.test.doc_patch import launch_server_cmd
from sglang.utils import wait_for_server, print_highlight, terminate_process
# This is equivalent to running the following command in your terminal
# python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct --host 0.0.0.0
server_process, port = launch_server_cmd(
    """
python3 -m sglang.launch_server --model-path qwen/qwen2.5-0.5b-instruct \
 --host 0.0.0.0 --log-level warning
"""
)
wait_for_server(f"http://localhost:{port}")
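Once the server is up, you can optionally sanity-check it before sending traffic. A minimal sketch, assuming the server exposes SGLang's /get_model_info endpoint (which reports basic metadata such as the served model path):
[ ]:
import requests

# A successful JSON response confirms the server is reachable and serving the expected model.
info = requests.get(f"http://localhost:{port}/get_model_info")
print_highlight(info.json())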
Using cURL#
[ ]:
import subprocess, json
curl_command = f"""
curl -s http://localhost:{port}/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{{"model": "qwen/qwen2.5-0.5b-instruct", "messages": [{{"role": "user", "content": "What is the capital of France?"}}]}}'
"""
response = json.loads(subprocess.check_output(curl_command, shell=True))
print_highlight(response)
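The response follows the OpenAI chat completion schema, so the generated answer can be pulled out of the parsed JSON directly:
[ ]:
# The assistant's reply lives in the first choice's message content.
print_highlight(response["choices"][0]["message"]["content"])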
Using Python Requests#
[ ]:
import requests
url = f"http://localhost:{port}/v1/chat/completions"
data = {
    "model": "qwen/qwen2.5-0.5b-instruct",
    "messages": [{"role": "user", "content": "What is the capital of France?"}],
}
response = requests.post(url, json=data)
print_highlight(response.json())
Using the OpenAI Python Client#
[ ]:
import openai
client = openai.Client(base_url=f"http://127.0.0.1:{port}/v1", api_key="None")
response = client.chat.completions.create(
    model="qwen/qwen2.5-0.5b-instruct",
    messages=[
        {"role": "user", "content": "List 3 countries and their capitals."},
    ],
    temperature=0,
    max_tokens=64,
)
print_highlight(response)
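The returned object mirrors the OpenAI SDK's ChatCompletion type, so you can access the generated text as an attribute instead of printing the whole object:
[ ]:
# Extract only the generated text from the first choice.
print_highlight(response.choices[0].message.content)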
Streaming#
[ ]:
import openai
client = openai.Client(base_url=f"http://127.0.0.1:{port}/v1", api_key="None")
# Use stream=True to request a streaming response
response = client.chat.completions.create(
    model="qwen/qwen2.5-0.5b-instruct",
    messages=[
        {"role": "user", "content": "List 3 countries and their capitals."},
    ],
    temperature=0,
    max_tokens=64,
    stream=True,
)

# Handle the streaming output
for chunk in response:
    if chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="", flush=True)
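If you also need the complete reply after streaming, you can accumulate the deltas as they arrive. A minimal sketch, assuming each chunk carries a choices entry as in the loop above:
[ ]:
# Re-issue the streaming request and join the deltas into the full reply.
stream = client.chat.completions.create(
    model="qwen/qwen2.5-0.5b-instruct",
    messages=[{"role": "user", "content": "List 3 countries and their capitals."}],
    temperature=0,
    max_tokens=64,
    stream=True,
)
full_text = "".join(chunk.choices[0].delta.content or "" for chunk in stream)
print_highlight(full_text)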
Using Native Generation APIs#
You can also interact with the native /generate endpoint via requests, which offers more flexibility. For an API reference, see Sampling Parameters.
[ ]:
import requests
response = requests.post(
    f"http://localhost:{port}/generate",
    json={
        "text": "The capital of France is",
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 32,
        },
    },
)
print_highlight(response.json())
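Besides the generated "text", the /generate response typically includes a "meta_info" field with details such as token counts and the finish reason (assuming the default SGLang response schema):
[ ]:
result = response.json()
# "text" holds the completion; "meta_info" holds token counts, finish reason, etc.
print_highlight(result["text"])
print_highlight(result.get("meta_info", {}))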
Streaming#
[ ]:
import requests, json
response = requests.post(
    f"http://localhost:{port}/generate",
    json={
        "text": "The capital of France is",
        "sampling_params": {
            "temperature": 0,
            "max_new_tokens": 32,
        },
        "stream": True,
    },
    stream=True,
)

prev = 0
for chunk in response.iter_lines(decode_unicode=False):
    chunk = chunk.decode("utf-8")
    if chunk and chunk.startswith("data:"):
        if chunk == "data: [DONE]":
            break
        data = json.loads(chunk[5:].strip("\n"))
        # Each data: line carries the cumulative text, so print only the new suffix.
        output = data["text"]
        print(output[prev:], end="", flush=True)
        prev = len(output)
[ ]:
terminate_process(server_process)