我们正与精选合作伙伴共同开发结构化生成的新接口。
需要 XML、FHIR、自定义模式或语法?欢迎与我们交流。
审计您的模式:分享一个模式,我们将展示在生成过程中哪些部分会出错、修复问题的约束条件,以及修复前后的合规率。请在此注册。

大语言模型(LLM)功能强大,但其输出难以预测。大多数解决方案试图在生成后通过解析、正则表达式或脆弱的代码来修复不良输出,但这些方法很容易失效。
Outlines 直接在生成过程中保证结构化输出——适用于任何 LLM。
model(prompt, output_type)
Outlines 遵循一个简单的模式,它反映了 Python 自身的类型系统。只需指定期望的输出类型,Outlines 将确保您的数据完全匹配该结构:
例如:Literal["Yes", "No"]、int。

开始使用 outlines 非常简单:
pip install outlines
import outlines
from transformers import AutoTokenizer, AutoModelForCausalLM
# Hugging Face checkpoint used by this example.
MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"

# Load the causal LM first, then its tokenizer, and hand both to Outlines.
hf_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
hf_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = outlines.from_transformers(hf_model, hf_tokenizer)
from typing import Literal
from pydantic import BaseModel
# Simple classification: the output type is a Literal with three labels.
sentiment_prompt = "分析:'这个产品彻底改变了我的生活!'"
SentimentLabel = Literal["Positive", "Negative", "Neutral"]
sentiment = model(sentiment_prompt, SentimentLabel)
print(sentiment)  # "Positive"

# Extract a specific type: `int` is passed as the output type.
boiling_point_prompt = "水的沸点是多少摄氏度?"
temperature = model(boiling_point_prompt, int)
print(temperature)  # 100
from pydantic import BaseModel
from enum import Enum
class Rating(Enum):
    # Review score as an enumeration; members are numbered 1 (poor) to
    # 4 (excellent). Documented with a comment rather than a docstring so the
    # class metadata stays exactly as in the original example.
    poor = 1
    fair = 2
    good = 3
    excellent = 4
class ProductReview(BaseModel):
    # Structured product review used as the output type for the model call.
    # Field order is kept exactly as in the original example.
    rating: Rating  # overall score, one of the Rating members
    pros: list[str]  # positive points
    cons: list[str]  # negative points
    summary: str  # free-text summary
review_prompt = "评论:XPS 13 电池续航出色,显示屏惊艳,但运行发热严重,且摄像头质量差。"
raw_review = model(review_prompt, ProductReview, max_new_tokens=200)

# The raw model output is validated as JSON into a ProductReview instance.
review = ProductReview.model_validate_json(raw_review)

print(f"评分:{review.rating.name}")  # "评分:good"
print(f"优点:{review.pros}")  # "优点:['great battery life', 'stunning display']"
print(f"总结:{review.summary}")  # "总结:Good laptop with great display but thermal issues"
以下是一些生产就绪的示例,展示了 Outlines 如何解决常见问题:
import outlines
from enum import Enum
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import List
# Hugging Face checkpoint used by this example.
MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"

# Load the causal LM first, then its tokenizer, and hand both to Outlines.
hf_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
hf_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = outlines.from_transformers(hf_model, hf_tokenizer)
def alert_manager(ticket):
    """Print an alert line for ``ticket`` to standard output.

    Args:
        ticket: Any printable object; it is printed after the alert prefix.

    Returns:
        None.
    """
    print("警报!", ticket)
class TicketPriority(str, Enum):
    # Ticket urgency levels. Mixing in `str` makes each member compare equal
    # to its plain string value (e.g. TicketPriority.urgent == "urgent").
    low = "low"
    medium = "medium"
    high = "high"
    urgent = "urgent"
class ServiceTicket(BaseModel):
    # Structured view of a customer support ticket used as the output type
    # for the model call. Field order is kept exactly as in the original.
    priority: TicketPriority  # urgency level
    category: str  # free-text category
    requires_manager: bool  # whether the ticket should be escalated
    summary: str  # short description of the issue
    action_items: List[str]  # follow-up actions
customer_email = """
主题:紧急 - 付款后无法访问我的账户
我 3 小时前购买了高级套餐,但仍然无法访问任何功能。
我已经尝试多次登出再登录。这令人无法接受,因为我一小时后有一个客户演示,需要用到分析仪表板。
请立即修复此问题或退还我的付款。
"""

# Chat-style prompt wrapping the email in a single user turn.
prompt = f"""
<|im_start|>user
分析这封客户邮件:
{customer_email}
<|im_end|>
<|im_start|>assistant
"""

ticket = model(
    prompt,
    ServiceTicket,
    max_new_tokens=500,
)

# Use the structured data to route the ticket: the raw model output is
# validated as JSON into a ServiceTicket first.
ticket = ServiceTicket.model_validate_json(ticket)
# TicketPriority mixes in `str`, so comparing against the enum member is
# equivalent to comparing against the string "urgent".
if ticket.priority == TicketPriority.urgent or ticket.requires_manager:
    alert_manager(ticket)
import outlines
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM
from typing import List, Optional
# Hugging Face checkpoint used by this example.
MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"

# Load the causal LM first, then its tokenizer, and hand both to Outlines.
hf_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
hf_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = outlines.from_transformers(hf_model, hf_tokenizer)
def update_inventory(product, category, sub_category):
    """Print a confirmation that ``product`` was filed under a category.

    Only the part of ``product`` before its first ASCII comma is shown.

    Args:
        product: Product description string.
        category: Main category name.
        sub_category: Sub-category name.

    Returns:
        None.
    """
    print(f"已将 {product.split(',')[0]} 更新到类别 {category}/{sub_category}")
class ProductCategory(BaseModel):
    # Structured categorisation of one product description, used as the
    # output type for the model call. Field order is kept as in the original.
    main_category: str
    sub_category: str
    attributes: List[str]
    brand_match: Optional[str]  # no default is declared for this field
# Product descriptions to process as a batch.
product_descriptions = [
    "Apple iPhone 15 Pro Max 256GB 钛金属,6.7 英寸 Super Retina XDR 显示屏,支持 ProMotion",
    "有机棉 T 恤,男款中号,海军蓝,100% 可持续材料",
    "KitchenAid 立式搅拌机,5 夸脱,红色,10 档速度设置,带揉面钩附件",
]

# Prompt template; `description` is filled in once per product.
template = outlines.Template.from_string("""
<|im_start|>user
对此产品进行分类:
{{ description }}
<|im_end|>
<|im_start|>assistant
""")

# Get a structured categorisation for every product.
categories = model(
    [template(description=desc) for desc in product_descriptions],
    ProductCategory,
    max_new_tokens=200,
)

# Validate each raw output as JSON into a ProductCategory, then use the
# categories for inventory management.
categories = [
    ProductCategory.model_validate_json(category) for category in categories
]
for product, category in zip(product_descriptions, categories):
    update_inventory(product, category.main_category, category.sub_category)
import outlines
from typing import Union, List, Literal
from pydantic import BaseModel
from enum import Enum
from transformers import AutoTokenizer, AutoModelForCausalLM
# Hugging Face checkpoint used by this example.
MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"

# Load the causal LM first, then its tokenizer, and hand both to Outlines.
hf_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
hf_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = outlines.from_transformers(hf_model, hf_tokenizer)
class EventType(str, Enum):
    # Kinds of event the extractor may report. Mixing in `str` makes each
    # member compare equal to its plain string value.
    conference = "conference"
    webinar = "webinar"
    workshop = "workshop"
    meetup = "meetup"
    other = "other"
class EventInfo(BaseModel):
    """关于技术活动的结构化信息"""
    # English: "Structured information about a tech event".
    # NOTE(review): the docstring above is left verbatim because pydantic may
    # surface a model's docstring as its schema description — confirm before
    # translating it.
    name: str
    date: str  # kept as free text, not a date type
    location: str
    event_type: EventType
    topics: List[str]
    registration_required: bool
# Union output type: either a structured EventInfo or the literal
# "I don't know".
UnknownAnswer = Literal["I don't know"]
EventResponse = Union[EventInfo, UnknownAnswer]

# Sample event descriptions.
# Complete information:
complete_description = """
欢迎参加 DevCon 2023,这是将于 2023 年 11 月 15-17 日在旧金山会议中心举行的顶级开发者大会。
主题包括 AI/ML、云基础设施和 web3。需要注册。
"""
# Insufficient information:
sparse_description = """
下周的技术活动。更多详情即将公布!
"""
event_descriptions = [complete_description, sparse_description]
# Process each event description.
results = []
for description in event_descriptions:
    # The system and user turns now open with "<|im_start|>"; the original
    # markers were missing the closing "|" and did not match the assistant
    # turn's marker.
    prompt = f"""
<|im_start|>system
你是一个乐于助人的助手
<|im_end|>
<|im_start|>user
提取此技术活动的结构化信息:
{description}
如果有足够的信息,请返回包含以下字段的 JSON 对象:
- name: 活动名称
- date: 活动举办日期
- location: 活动举办地点
- event_type: 'conference', 'webinar', 'workshop', 'meetup' 或 'other' 之一
- topics: 活动主题列表
- registration_required: 表示是否需要注册的布尔值
如果可用信息不足以填写此 JSON,并且仅在这种情况下,请回答“I don't know”。
<|im_end|>
<|im_start|>assistant
"""
    # The union type lets the model return structured data or "I don't know".
    result = model(prompt, EventResponse, max_new_tokens=200)
    # NOTE(review): assumes the model returns raw text, as in the other
    # examples that call model_validate_json on the output — confirm.
    # Text that validates as an EventInfo is stored as an EventInfo; anything
    # else (e.g. "I don't know") is stored unchanged.
    if isinstance(result, str):
        try:
            result = EventInfo.model_validate_json(result)
        except ValueError:
            # Not valid EventInfo JSON; keep the raw answer.
            pass
    results.append(result)
# Display the results.
for i, result in enumerate(results):
    print(f"活动 {i+1}:")
    if isinstance(result, str):
        # Plain-text answers are printed as-is.
        print(f" {result}")
    else:
        # Anything that is not a string is treated as an EventInfo object.
        print(f" 名称:{result.name}")
        print(f" 类型:{result.event_type}")
        print(f" 日期:{result.date}")
        print(f" 主题:{', '.join(result.topics)}")
    # Blank line between events.
    print()

# Use the structured data in downstream processing: count the EventInfo
# results.
structured_count = sum(1 for r in results if isinstance(r, EventInfo))
print(f"成功为 {structured_count} 个活动(共 {len(results)} 个)提取了数据")
import outlines
from typing import Literal, List
import pandas as pd
from transformers import AutoTokenizer, AutoModelForCausalLM
# Hugging Face checkpoint used by this example.
MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"

# Load the causal LM first, then its tokenizer, and hand both to Outlines.
hf_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
hf_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = outlines.from_transformers(hf_model, hf_tokenizer)
# Classification labels, defined as a Literal type.
DocumentCategory = Literal[
    "Financial Report",
    "Legal Contract",
    "Technical Documentation",
    "Marketing Material",
    "Personal Correspondence",
]

# Sample documents to classify.
sample_financial = "第三季度财务摘要:收入同比增长 15% 至 1240 万美元。EBITDA 利润率提高至 23%,而去年第三季度为 19%。运营费用..."
sample_legal = "本协议由甲方和乙方(以下合称“双方”)于...日签订"
sample_technical = "API 接受带有 JSON 负载的 POST 请求。必需参数包括 'user_id' 和 'transaction_type'。端点成功时返回 200 状态码。"
documents = [sample_financial, sample_legal, sample_technical]

# Prompt template; `document` is filled in once per classified text.
template_source = """
<|im_start|>user
将以下文档精确分类到以下类别之一:
- Financial Report
- Legal Contract
- Technical Documentation
- Marketing Material
- Personal Correspondence
文档:
{{ document }}
<|im_end|>
<|im_start|>assistant
"""
template = outlines.Template.from_string(template_source)
# Classify documents.
def classify_documents(texts: List[str]) -> List[DocumentCategory]:
    """Classify each text with the module-level ``model`` and ``template``.

    Args:
        texts: Documents to classify, one prompt is rendered per document.

    Returns:
        The model outputs in the same order as ``texts``; generation is
        requested with ``DocumentCategory`` as the output type.
    """
    return [
        model(template(document=text), DocumentCategory, max_new_tokens=200)
        for text in texts
    ]
# Run the classification.
classifications = classify_documents(documents)

# Build a simple results table: the first 50 characters of each document
# followed by "...", next to its classification.
previews = [f"{doc[:50]}..." for doc in documents]
results_df = pd.DataFrame({"文档": previews, "分类": classifications})
print(results_df)

# Count documents per category.
category_counts = pd.Series(classifications).value_counts()
print("\n类别分布:")
print(category_counts)
import outlines
import json
from typing import List, Optional
from datetime import date
from transformers import AutoTokenizer, AutoModelForCausalLM
# Hugging Face checkpoint used by this example.
MODEL_NAME = "microsoft/phi-4"

# Load the causal LM first, then its tokenizer, and hand both to Outlines.
hf_model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, device_map="auto")
hf_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = outlines.from_transformers(hf_model, hf_tokenizer)
# Define a function with typed parameters.
def schedule_meeting(
    title: str,
    date: date,
    duration_minutes: int,
    attendees: List[str],
    location: Optional[str] = None,
    agenda_items: Optional[List[str]] = None,
):
    """Schedule a meeting with the given details and return a confirmation.

    Args:
        title: Meeting title, quoted in the confirmation.
        date: Meeting date; formatted into the confirmation as-is.
        duration_minutes: Length of the meeting in minutes (not used in the
            confirmation text).
        attendees: Invitees; only their count appears in the confirmation.
        location: Optional location (not used in the confirmation text).
        agenda_items: Optional agenda (not used in the confirmation text).

    Returns:
        A confirmation string with the title, date and attendee count.
    """
    # A real application would create the meeting here; this example only
    # reports what would be scheduled.
    return f"会议 '{title}' 已安排于 {date},与会者 {len(attendees)} 人"
# Natural-language request.
user_request = """
我需要在下周二下午 2 点为工程团队安排一次产品路线图评审会议。
会议应持续 90 分钟。请邀请 john@example.com、
sarah@example.com 和产品团队 product@example.com。
"""

# The function itself is passed as the output type for the model call.
prompt = f"""
<|im_start|>user
从此请求中提取会议详情:
{user_request}
<|im_end|>
<|im_start|>assistant
"""
raw_params = model(prompt, schedule_meeting, max_new_tokens=200)

# Decode the raw JSON output; it is unpacked below as keyword arguments.
meeting_params = json.loads(raw_params)
print(meeting_params)

# Call the function with the extracted parameters.
result = schedule_meeting(**meeting_params)
print(result)
# "会议 'Product Roadmap Review' 已安排于 2023-10-17,与会者 3 人"