名称: "specification-extractor"
描述: "从施工规范文件中提取结构化数据。解析 CSI 章节、要求、提交项和产品数据。"
主页: "https://datadrivenconstruction.io"
元数据: {"openclaw": {"emoji": "📑", "os": ["darwin", "linux", "win32"], "homepage": "https://datadrivenconstruction.io", "requires": {"bins": ["python3"]}}}
从施工规范文件中提取结构化数据。解析 CSI MasterFormat 章节,识别要求、提交项和产品标准,并整理出可直接用于估算和采购的数据。
自动化规范提取可实现:
- 快速估算:快速识别工作范围和具体要求
- 采购准确性:提取精确的产品规格
- 提交项跟踪:识别所有必需的提交项
- 合规性检查:对照标准验证规范
from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional
import re
import pdfplumber
from pathlib import Path
@dataclass
class SpecSection:
    """One specification section, split into its three numbered parts.

    Each ``part*`` dictionary is whatever the parser stored for that part; in
    this file it is the ``{'articles': {...}, 'items': [...]}`` mapping built
    by ``SpecificationExtractor._parse_part_content``.
    """

    # CSI section number, e.g. "03 30 00". The parser in this file removes the
    # spaces before storing it, so parsed sections carry "033000".
    number: str
    # Section title taken from the section header line.
    title: str
    # Parsed content of PART 1.
    part1_general: Dict[str, Any]
    # Parsed content of PART 2.
    part2_products: Dict[str, Any]
    # Parsed content of PART 3.
    part3_execution: Dict[str, Any]
    # Section text kept verbatim; the parser fills it with the lines that
    # appear after the section header but before the first PART header.
    raw_text: str
@dataclass
class ProductRequirement:
    """A product or manufacturer requirement found in a specification section."""

    # Number of the section the requirement was found in.
    section: str
    # Manufacturer name; empty string when the entry describes a material only.
    manufacturer: str
    # Product or material description; empty string for manufacturer-only entries.
    product_name: str
    # Model designation (the extractor in this file always leaves it empty).
    model: str
    # Standards referenced by the requirement, e.g. "ASTM C150".
    standards: List[str]
    # Free-form property name -> value pairs.
    properties: Dict[str, str]
@dataclass
class SubmittalRequirement:
    """A submittal that a specification section asks the contractor to provide."""

    # Number of the section the submittal was found in.
    section: str
    # Kind of submittal: shop drawings, samples, product data, etc.
    submittal_type: str
    # Text of the specification line that requested the submittal.
    description: str
    # When the submittal is due.
    timing: str
    # Number of copies to submit.
    copies: int
@dataclass
class SpecExtractionResult:
    """Everything extracted from one specification document."""

    # File name of the source document (name only, no directory).
    document_name: str
    # Number of pages in the source document.
    total_pages: int
    # Parsed sections, in the order they were encountered.
    sections: List[SpecSection]
    # Product and manufacturer requirements collected from the sections.
    products: List[ProductRequirement]
    # Submittal requirements collected from the sections.
    submittals: List[SubmittalRequirement]
    # Standards referenced anywhere in the document text.
    standards_referenced: List[str]
class SpecificationExtractor:
    """Extract structured data from construction specifications.

    The text of a PDF is read with ``pdfplumber``, split into sections, parts
    and articles using the regular expressions below, and then mined for
    manufacturers/materials, submittals and referenced standards. The
    ``generate_*`` methods render an extraction result as Markdown.
    """

    # CSI MasterFormat patterns, matched against single stripped lines.
    # Section header: six digits (optionally in pairs), a dash, then the title.
    CSI_SECTION_PATTERN = r'^(\d{2}\s?\d{2}\s?\d{2})\s*[-–]\s*(.+?)$'
    # Part header: "PART <n> - <name>".
    PART_PATTERN = r'^PART\s+(\d+)\s*[-–]\s*(.+?)$'
    # Article header: "<n>.<m> TITLE IN CAPITALS".
    ARTICLE_PATTERN = r'^(\d+\.\d+)\s+([A-Z][A-Z\s]+)$'
    # Outline marker that opens a list item: one capital letter or a number,
    # a period, then whitespace ("A. ", "D. ", "12. "). The mandatory
    # whitespace keeps names such as "U.S. Steel" from being read as a marker.
    LIST_MARKER_PATTERN = r'^(?:[A-Z]|\d+)\.\s+'

    # Submittal keywords (searched in lower-cased text) -> label stored on the
    # resulting SubmittalRequirement.
    SUBMITTAL_TYPES = {
        'shop drawings': '施工图',
        'product data': '产品数据',
        'samples': '样品',
        'certificates': '证书',
        'test reports': '测试报告',
        'manufacturer instructions': '制造商说明',
        'warranty': '保修',
        'maintenance data': '维护数据',
        'mock-ups': '实体模型',
    }

    # Common standards organisations, each followed by a designation.
    STANDARD_PATTERNS = [
        r'ASTM\s+[A-Z]\d+',
        r'ANSI\s+[A-Z]?\d+',
        r'ACI\s+\d+',
        r'AISC\s+\d+',
        r'AWS\s+[A-Z]\d+',
        r'ASCE\s+\d+',
        r'UL\s+\d+',
        r'FM\s+\d+',
        r'NFPA\s+\d+',
        r'IBC\s+\d+',
    ]

    def __init__(self):
        """Create an extractor with an empty section mapping."""
        # NOTE: none of the methods in this class writes to this mapping.
        self.sections: Dict[str, SpecSection] = {}

    def extract_from_pdf(self, pdf_path: str) -> SpecExtractionResult:
        """Extract specification data from a PDF file.

        Args:
            pdf_path: Path of the PDF to read.

        Returns:
            A SpecExtractionResult holding the parsed sections, the products
            and submittals derived from them, and the standards found anywhere
            in the document text.
        """
        page_texts = []
        with pdfplumber.open(pdf_path) as pdf:
            page_count = len(pdf.pages)
            for page in pdf.pages:
                # extract_text() can yield None; treat such pages as empty.
                page_texts.append(page.extract_text() or "")
        # Every page is followed by a blank line so pages never run together.
        all_text = "".join(text + "\n\n" for text in page_texts)

        sections = self._parse_sections(all_text)
        return SpecExtractionResult(
            document_name=Path(pdf_path).name,
            total_pages=page_count,
            sections=sections,
            products=self._extract_products(sections),
            submittals=self._extract_submittals(sections),
            standards_referenced=self._extract_standards(all_text),
        )

    def _parse_sections(self, text: str) -> List[SpecSection]:
        """Split specification text into CSI sections.

        Lines are stripped and blank lines dropped. A line matching
        CSI_SECTION_PATTERN opens a new section; a line matching PART_PATTERN
        opens a part inside the current section. Other lines are appended to
        the current part, or to the section preamble when no part is open.
        Lines seen before the first section header are discarded.

        Args:
            text: Full specification text.

        Returns:
            The parsed sections in document order.
        """
        sections = []
        current_section = None
        current_part = None
        preamble = []

        for raw_line in text.split('\n'):
            line = raw_line.strip()
            if not line:
                continue

            section_match = re.match(self.CSI_SECTION_PATTERN, line, re.IGNORECASE)
            if section_match:
                title = section_match.group(2).strip()
                if title.isdigit():
                    # "03 30 00 - 5" is a page footer (section number - page
                    # number), not a header; opening a section here would
                    # split the real section at every page break.
                    continue
                if current_section:
                    sections.append(self._finalize_section(current_section, preamble))
                current_section = {
                    'number': section_match.group(1).replace(' ', ''),
                    'title': title,
                    'parts': {}
                }
                preamble = []
                current_part = None
                continue

            part_match = re.match(self.PART_PATTERN, line, re.IGNORECASE)
            if part_match and current_section:
                current_part = f"part{part_match.group(1)}"
                current_section['parts'][current_part] = {
                    'name': part_match.group(2).strip(),
                    'content': []
                }
                continue

            if current_section and current_part:
                current_section['parts'][current_part]['content'].append(line)
            elif current_section:
                preamble.append(line)

        # Flush the section that was still open at end of text.
        if current_section:
            sections.append(self._finalize_section(current_section, preamble))
        return sections

    def _finalize_section(self, section_data: Dict, general_content: List[str]) -> SpecSection:
        """Build a SpecSection from the parser's working dictionary.

        Args:
            section_data: Dict with 'number', 'title' and 'parts' keys.
            general_content: Lines that preceded the first PART header; they
                become ``raw_text``.

        Returns:
            The finished section. Parts that were never seen are parsed from
            an empty line list.
        """
        parts = section_data.get('parts', {})
        parsed = {
            key: self._parse_part_content(parts.get(key, {}).get('content', []))
            for key in ('part1', 'part2', 'part3')
        }
        return SpecSection(
            number=section_data['number'],
            title=section_data['title'],
            part1_general=parsed['part1'],
            part2_products=parsed['part2'],
            part3_execution=parsed['part3'],
            raw_text='\n'.join(general_content)
        )

    def _parse_part_content(self, content: List[str]) -> Dict[str, Any]:
        """Group the lines of one part under their article headers.

        Args:
            content: Lines of the part, in order.

        Returns:
            ``{'articles': {number: {'title': str, 'items': [str]}},
            'items': [str]}`` where the top-level ``items`` holds the lines
            that appear before the first article header.
        """
        result = {
            'articles': {},
            'items': []
        }
        # Lines go here until the first article header is seen.
        target = result['items']

        for line in content:
            article_match = re.match(self.ARTICLE_PATTERN, line)
            if article_match:
                article = {'title': article_match.group(2), 'items': []}
                result['articles'][article_match.group(1)] = article
                target = article['items']
                continue
            target.append(line)
        return result

    @classmethod
    def _strip_list_marker(cls, item: str) -> Optional[str]:
        """Return ``item`` without its leading outline marker.

        Args:
            item: One line of article content.

        Returns:
            The text after the marker, or None when the line does not start
            with a marker as defined by LIST_MARKER_PATTERN.
        """
        stripped = item.strip()
        marker = re.match(cls.LIST_MARKER_PATTERN, stripped)
        if marker is None:
            return None
        return stripped[marker.end():].strip()

    def _extract_products(self, sections: List[SpecSection]) -> List[ProductRequirement]:
        """Collect product requirements from PART 2 of every section.

        Articles whose title contains MANUFACTURERS yield one entry per list
        item, with the text after the marker as the manufacturer name.
        Articles whose title contains MATERIALS or PRODUCTS yield one entry
        per line that references at least one standard.

        Args:
            sections: Parsed sections.

        Returns:
            The collected requirements, in document order.
        """
        products = []
        for section in sections:
            for article in section.part2_products.get('articles', {}).values():
                title = article['title'].upper()
                if 'MANUFACTURERS' in title:
                    for item in article['items']:
                        # Any lettered or numbered item counts, so a list
                        # longer than three entries is not cut short.
                        mfr_name = self._strip_list_marker(item)
                        if not mfr_name:
                            continue
                        products.append(ProductRequirement(
                            section=section.number,
                            manufacturer=mfr_name,
                            product_name='',
                            model='',
                            standards=[],
                            properties={}
                        ))
                elif 'MATERIALS' in title or 'PRODUCTS' in title:
                    for item in article['items']:
                        standards = self._extract_standards(item)
                        if standards:
                            products.append(ProductRequirement(
                                section=section.number,
                                manufacturer='',
                                # Keep the description short.
                                product_name=item[:100],
                                model='',
                                standards=standards,
                                properties={}
                            ))
        return products

    def _extract_submittals(self, sections: List[SpecSection]) -> List[SubmittalRequirement]:
        """Collect submittal requirements from PART 1 of every section.

        Only articles whose title contains SUBMITTAL are scanned. Each line is
        classified by the first SUBMITTAL_TYPES keyword it contains; lines
        without a keyword are skipped. ``timing`` and ``copies`` are fixed
        values, not parsed from the text.

        Args:
            sections: Parsed sections.

        Returns:
            The collected submittals, in document order.
        """
        submittals = []
        for section in sections:
            for article in section.part1_general.get('articles', {}).values():
                if 'SUBMITTAL' not in article['title'].upper():
                    continue
                for item in article['items']:
                    item_lower = item.lower()
                    submittal_type = next(
                        (label for keyword, label in self.SUBMITTAL_TYPES.items()
                         if keyword in item_lower),
                        None,
                    )
                    if submittal_type is None:
                        continue
                    submittals.append(SubmittalRequirement(
                        section=section.number,
                        submittal_type=submittal_type,
                        description=item.strip(),
                        timing='制造前',
                        copies=3
                    ))
        return submittals

    def _extract_standards(self, text: str) -> List[str]:
        """Find the standards referenced in ``text``.

        Each STANDARD_PATTERNS entry is matched case-insensitively and must
        start at a word boundary, so "haul 20" is not reported as "ul 20".
        Whitespace inside a match is collapsed to single spaces, which merges
        references that were split across lines.

        Args:
            text: Text to search.

        Returns:
            The distinct matches, sorted so the result is deterministic.
        """
        found = set()
        for pattern in self.STANDARD_PATTERNS:
            for match in re.finditer(rf'\b(?:{pattern})', text, re.IGNORECASE):
                found.add(' '.join(match.group(0).split()))
        return sorted(found)

    def generate_submittal_log(self, result: SpecExtractionResult) -> str:
        """Render the submittals of ``result`` as a Markdown table.

        Descriptions longer than 50 characters are truncated with "...", and
        pipe characters are escaped so they cannot break the table columns.

        Args:
            result: Extraction result to report on.

        Returns:
            The Markdown text.
        """
        lines = ["# 提交项日志", ""]
        lines.append(f"**项目规范:** {result.document_name}")
        lines.append(f"**总提交项:** {len(result.submittals)}")
        lines.append("")
        lines.append("| # | 章节 | 类型 | 描述 | 状态 |")
        lines.append("|---|---------|------|-------------|--------|")

        for i, sub in enumerate(result.submittals, 1):
            desc = sub.description
            if len(desc) > 50:
                desc = desc[:50] + "..."
            desc = desc.replace("|", "\\|")
            lines.append(f"| {i} | {sub.section} | {sub.submittal_type} | {desc} | 待处理 |")
        return "\n".join(lines)

    def generate_product_schedule(self, result: SpecExtractionResult) -> str:
        """Render the products of ``result`` as Markdown, grouped by section.

        Args:
            result: Extraction result to report on.

        Returns:
            The Markdown text, one heading per section number in sorted order.
        """
        lines = ["# 产品计划表", ""]

        by_section = {}
        for prod in result.products:
            by_section.setdefault(prod.section, []).append(prod)

        for section, products in sorted(by_section.items()):
            lines.append(f"## 章节 {section}")
            lines.append("")
            for prod in products:
                # Empty fields are omitted from the listing.
                if prod.manufacturer:
                    lines.append(f"- **制造商:** {prod.manufacturer}")
                if prod.product_name:
                    lines.append(f"- **产品:** {prod.product_name}")
                if prod.standards:
                    lines.append(f"- **标准:** {', '.join(prod.standards)}")
            lines.append("")
        return "\n".join(lines)

    def generate_report(self, result: SpecExtractionResult) -> str:
        """Render a Markdown summary of the whole extraction.

        The report lists the sections, the referenced standards (when any),
        submittal counts per type, and the total number of product entries.

        Args:
            result: Extraction result to report on.

        Returns:
            The Markdown text.
        """
        lines = ["# 规范提取报告", ""]
        lines.append(f"**文档:** {result.document_name}")
        lines.append(f"**页数:** {result.total_pages}")
        lines.append(f"**找到的章节:** {len(result.sections)}")
        lines.append("")

        # Section summary
        lines.append("## 提取的章节")
        for section in result.sections:
            lines.append(f"- **{section.number}** - {section.title}")
        lines.append("")

        # Standards (section omitted when none were found)
        if result.standards_referenced:
            lines.append("## 引用的标准")
            for std in sorted(set(result.standards_referenced)):
                lines.append(f"- {std}")
            lines.append("")

        # Submittal summary, counted per type
        lines.append("## 必需的提交项")
        lines.append(f"总计:{len(result.submittals)}")
        by_type = {}
        for sub in result.submittals:
            by_type[sub.submittal_type] = by_type.get(sub.submittal_type, 0) + 1
        for t, count in sorted(by_type.items()):
            lines.append(f"- {t}:{count}")
        lines.append("")

        # Product summary
        lines.append("## 产品/制造商")
        lines.append(f"总计:{len(result.products)}")
        return "\n".join(lines)
# Usage example. Guarded so that importing this module does not try to open
# the sample PDF; when the file is run as a script the behaviour is unchanged.
if __name__ == "__main__":
    # Initialise the extractor
    extractor = SpecificationExtractor()

    # Extract from a PDF
    result = extractor.extract_from_pdf("Project_Specifications.pdf")
    print(f"找到 {len(result.sections)} 个章节")
    print(f"找到 {len(result.submittals)} 个提交项")
    print(f"找到 {len(result.products)} 个产品要求")

    # Generate the submittal log
    submittal_log = extractor.generate_submittal_log(result)
    print(submittal_log)

    # Generate the product schedule
    product_schedule = extractor.generate_product_schedule(result)
    print(product_schedule)

    # Full report
    report = extractor.generate_report(result)
    print(report)
依赖安装(在命令行中执行):pip install pdfplumber