specification-extractor：从文档中提取结构化规范数据

astral · 2026-02-05 10:12:03 · 64 次点击 · 0 条评论

名称： "specification-extractor"
描述： "从施工规范文件中提取结构化数据。解析 CSI 章节、要求、提交项和产品数据。"
主页： "https://datadrivenconstruction.io"
元数据： {"openclaw": {"emoji": "📑", "os": ["darwin", "linux", "win32"], "homepage": "https://datadrivenconstruction.io", "requires": {"bins": ["python3"]}}}

施工规范提取器

概述

从施工规范文件中提取结构化数据。解析 CSI MasterFormat 章节，识别要求、提交项和产品标准，并整理出可直接用于估算和采购的数据。

业务价值

自动化规范提取可实现：
- 快速估算：快速识别工作范围和具体要求
- 采购准确性：提取精确的产品规格
- 提交项跟踪：识别所有必需的提交项
- 合规性检查：对照标准验证规范

技术实现

from dataclasses import dataclass, field
from typing import List, Dict, Any, Optional
import re
import pdfplumber
from pathlib import Path

@dataclass
class SpecSection:
    """One specification section as assembled by ``SpecificationExtractor``.

    Each ``part*`` dictionary is the output of
    ``SpecificationExtractor._parse_part_content`` and therefore carries an
    ``'articles'`` mapping and an ``'items'`` list.
    """
    number: str  # CSI section number, e.g. "03 30 00"; the parser stores it with spaces removed
    title: str  # heading text that follows the dash on the section line
    part1_general: Dict[str, Any]  # parsed content collected under the "PART 1" heading
    part2_products: Dict[str, Any]  # parsed content collected under the "PART 2" heading
    part3_execution: Dict[str, Any]  # parsed content collected under the "PART 3" heading
    raw_text: str  # newline-joined lines seen after the section heading but before any PART heading

@dataclass
class ProductRequirement:
    """A manufacturer or material entry taken from a section's PART 2 content."""
    section: str  # number of the section the entry was found in
    manufacturer: str  # manufacturer name; left empty for material/product entries
    product_name: str  # leading text of a material item; left empty for manufacturer entries
    model: str  # model designation; the extractor in this file always leaves it empty
    standards: List[str]  # standard designations found in the item text
    properties: Dict[str, str]  # free-form properties; the extractor in this file always leaves it empty

@dataclass
class SubmittalRequirement:
    """A submittal obligation found in a section's PART 1 content."""
    section: str  # number of the section the submittal belongs to
    submittal_type: str  # shop drawings, samples, product data, etc.
    description: str  # the item line that triggered the match, whitespace-stripped
    timing: str  # when the submittal is due; the extractor fills in a fixed default
    copies: int  # number of copies required; the extractor fills in a fixed default

@dataclass
class SpecExtractionResult:
    """Everything ``SpecificationExtractor.extract_from_pdf`` returns for one document."""
    document_name: str  # file name component of the PDF path
    total_pages: int  # number of pages in the PDF
    sections: List[SpecSection]  # parsed sections in document order
    products: List[ProductRequirement]  # manufacturer/material entries from all sections
    submittals: List[SubmittalRequirement]  # submittal entries from all sections
    standards_referenced: List[str]  # standard designations found anywhere in the document text

class SpecificationExtractor:
    """Extract structured data from construction specifications.

    The class-level constants below drive the line-oriented parser: three
    heading patterns, a keyword table for classifying submittals, and a list
    of regex fragments for recognising referenced standards.
    """

    # CSI MasterFormat patterns
    # Section heading: three two-digit groups (optionally whitespace-separated),
    # a hyphen or en dash, then the title.
    CSI_SECTION_PATTERN = r'^(\d{2}\s?\d{2}\s?\d{2})\s*[-–]\s*(.+?)$'
    # Part heading: "PART <n> - <name>".
    PART_PATTERN = r'^PART\s+(\d+)\s*[-–]\s*(.+?)$'
    # Article heading: "<n>.<n> TITLE", where the title is upper-case letters
    # and whitespace only (punctuation in a title prevents a match).
    ARTICLE_PATTERN = r'^(\d+\.\d+)\s+([A-Z][A-Z\s]+)$'

    # Submittal type keywords
    # Maps a lower-case English keyword searched for in item text to the label
    # stored as ``SubmittalRequirement.submittal_type``.
    SUBMITTAL_TYPES = {
        'shop drawings': '施工图',
        'product data': '产品数据',
        'samples': '样品',
        'certificates': '证书',
        'test reports': '测试报告',
        'manufacturer instructions': '制造商说明',
        'warranty': '保修',
        'maintenance data': '维护数据',
        'mock-ups': '实体模型',
    }

    # Common standards organizations
    # Each entry is a regex fragment: organisation acronym, whitespace, designation.
    STANDARD_PATTERNS = [
        r'ASTM\s+[A-Z]\d+',
        r'ANSI\s+[A-Z]?\d+',
        r'ACI\s+\d+',
        r'AISC\s+\d+',
        r'AWS\s+[A-Z]\d+',
        r'ASCE\s+\d+',
        r'UL\s+\d+',
        r'FM\s+\d+',
        r'NFPA\s+\d+',
        r'IBC\s+\d+',
    ]

    def __init__(self):
        self.sections: Dict[str, SpecSection] = {}

    def extract_from_pdf(self, pdf_path: str) -> SpecExtractionResult:
        """Read every page of a PDF specification and extract structured data.

        Args:
            pdf_path: Location of the PDF file; it is opened with pdfplumber.

        Returns:
            A SpecExtractionResult holding the file name, the page count, the
            parsed sections, the product and submittal requirements derived
            from those sections, and the standards found in the full text.
        """
        source = Path(pdf_path)

        with pdfplumber.open(pdf_path) as document:
            pages = document.pages
            page_total = len(pages)
            # Fall back to an empty string when a page yields no text, and
            # keep a blank-line separator after every page.
            page_texts = [(page.extract_text() or "") + "\n\n" for page in pages]

        full_text = "".join(page_texts)
        parsed_sections = self._parse_sections(full_text)

        return SpecExtractionResult(
            document_name=source.name,
            total_pages=page_total,
            sections=parsed_sections,
            products=self._extract_products(parsed_sections),
            submittals=self._extract_submittals(parsed_sections),
            standards_referenced=self._extract_standards(full_text),
        )

    def _parse_sections(self, text: str) -> List[SpecSection]:
        """从规范文本中解析 CSI 章节。"""
        sections = []
        lines = text.split('\n')

        current_section = None
        current_part = None
        current_content = []

        for line in lines:
            line = line.strip()
            if not line:
                continue

            # 检查章节标题
            section_match = re.match(self.CSI_SECTION_PATTERN, line, re.IGNORECASE)
            if section_match:
                # 保存前一章节
                if current_section:
                    sections.append(self._finalize_section(current_section, current_content))

                current_section = {
                    'number': section_match.group(1).replace(' ', ''),
                    'title': section_match.group(2).strip(),
                    'parts': {}
                }
                current_content = []
                current_part = None
                continue

            # 检查部分标题
            part_match = re.match(self.PART_PATTERN, line, re.IGNORECASE)
            if part_match and current_section:
                part_num = part_match.group(1)
                part_name = part_match.group(2).strip()
                current_part = f"part{part_num}"
                current_section['parts'][current_part] = {
                    'name': part_name,
                    'content': []
                }
                continue

            # 将内容添加到当前部分
            if current_section and current_part:
                current_section['parts'][current_part]['content'].append(line)
            elif current_section:
                current_content.append(line)

        # 保存最后一个章节
        if current_section:
            sections.append(self._finalize_section(current_section, current_content))

        return sections

    def _finalize_section(self, section_data: Dict, general_content: List[str]) -> SpecSection:
        """Turn the parser's working dictionary into a SpecSection.

        Args:
            section_data: Accumulator with ``'number'``, ``'title'`` and an
                optional ``'parts'`` mapping keyed ``'part1'``..``'part3'``.
            general_content: Lines collected before any part heading.

        Returns:
            A SpecSection whose three part fields hold the structured form of
            each part's lines (an absent part is parsed as an empty list) and
            whose ``raw_text`` is ``general_content`` joined by newlines.
        """
        collected = section_data.get('parts', {})
        structured = {
            key: self._parse_part_content(collected.get(key, {}).get('content', []))
            for key in ('part1', 'part2', 'part3')
        }

        return SpecSection(
            number=section_data['number'],
            title=section_data['title'],
            part1_general=structured['part1'],
            part2_products=structured['part2'],
            part3_execution=structured['part3'],
            raw_text='\n'.join(general_content),
        )

    def _parse_part_content(self, content: List[str]) -> Dict[str, Any]:
        """将部分内容解析为结构化数据。"""
        result = {
            'articles': {},
            'items': []
        }

        current_article = None

        for line in content:
            # 检查条款标题
            article_match = re.match(self.ARTICLE_PATTERN, line)
            if article_match:
                current_article = article_match.group(1)
                result['articles'][current_article] = {
                    'title': article_match.group(2),
                    'items': []
                }
                continue

            # 添加到当前条款或常规项目
            if current_article and current_article in result['articles']:
                result['articles'][current_article]['items'].append(line)
            else:
                result['items'].append(line)

        return result

    def _extract_products(self, sections: List[SpecSection]) -> List[ProductRequirement]:
        """从第 2 部分提取产品要求。"""
        products = []

        for section in sections:
            part2 = section.part2_products

            for article_num, article in part2.get('articles', {}).items():
                if 'MANUFACTURERS' in article['title'].upper():
                    for item in article['items']:
                        # 提取制造商名称
                        if item.strip().startswith(('A.', 'B.', 'C.', '1.', '2.', '3.')):
                            mfr_name = re.sub(r'^[A-Z\d]+\.\s*', '', item).strip()
                            products.append(ProductRequirement(
                                section=section.number,
                                manufacturer=mfr_name,
                                product_name='',
                                model='',
                                standards=[],
                                properties={}
                            ))

                elif 'MATERIALS' in article['title'].upper() or 'PRODUCTS' in article['title'].upper():
                    for item in article['items']:
                        # 提取材料要求
                        standards = self._extract_standards(item)
                        if standards:
                            products.append(ProductRequirement(
                                section=section.number,
                                manufacturer='',
                                product_name=item[:100],
                                model='',
                                standards=standards,
                                properties={}
                            ))

        return products

    def _extract_submittals(self, sections: List[SpecSection]) -> List[SubmittalRequirement]:
        """从第 1 部分提取提交项要求。"""
        submittals = []

        for section in sections:
            part1 = section.part1_general

            for article_num, article in part1.get('articles', {}).items():
                if 'SUBMITTAL' in article['title'].upper():
                    for item in article['items']:
                        item_lower = item.lower()

                        for keyword, submittal_type in self.SUBMITTAL_TYPES.items():
                            if keyword in item_lower:
                                submittals.append(SubmittalRequirement(
                                    section=section.number,
                                    submittal_type=submittal_type,
                                    description=item.strip(),
                                    timing='制造前',
                                    copies=3
                                ))
                                break

        return submittals

    def _extract_standards(self, text: str) -> List[str]:
        """从文本中提取引用的标准。"""
        standards = []

        for pattern in self.STANDARD_PATTERNS:
            matches = re.findall(pattern, text, re.IGNORECASE)
            standards.extend(matches)

        return list(set(standards))

    def generate_submittal_log(self, result: SpecExtractionResult) -> str:
        """根据提取结果生成提交项日志。"""
        lines = ["# 提交项日志", ""]
        lines.append(f"**项目规范：** {result.document_name}")
        lines.append(f"**总提交项：** {len(result.submittals)}")
        lines.append("")

        lines.append("| # | 章节 | 类型 | 描述 | 状态 |")
        lines.append("|---|---------|------|-------------|--------|")

        for i, sub in enumerate(result.submittals, 1):
            desc = sub.description[:50] + "..." if len(sub.description) > 50 else sub.description
            lines.append(f"| {i} | {sub.section} | {sub.submittal_type} | {desc} | 待处理 |")

        return "\n".join(lines)

    def generate_product_schedule(self, result: SpecExtractionResult) -> str:
        """根据提取结果生成产品计划表。"""
        lines = ["# 产品计划表", ""]

        # 按章节分组
        by_section = {}
        for prod in result.products:
            if prod.section not in by_section:
                by_section[prod.section] = []
            by_section[prod.section].append(prod)

        for section, products in sorted(by_section.items()):
            lines.append(f"## 章节 {section}")
            lines.append("")

            for prod in products:
                if prod.manufacturer:
                    lines.append(f"- **制造商：** {prod.manufacturer}")
                if prod.product_name:
                    lines.append(f"- **产品：** {prod.product_name}")
                if prod.standards:
                    lines.append(f"- **标准：** {', '.join(prod.standards)}")
                lines.append("")

        return "\n".join(lines)

    def generate_report(self, result: SpecExtractionResult) -> str:
        """生成全面的提取报告。"""
        lines = ["# 规范提取报告", ""]
        lines.append(f"**文档：** {result.document_name}")
        lines.append(f"**页数：** {result.total_pages}")
        lines.append(f"**找到的章节：** {len(result.sections)}")
        lines.append("")

        # 章节摘要
        lines.append("## 提取的章节")
        for section in result.sections:
            lines.append(f"- **{section.number}** - {section.title}")
        lines.append("")

        # 标准
        if result.standards_referenced:
            lines.append("## 引用的标准")
            for std in sorted(set(result.standards_referenced)):
                lines.append(f"- {std}")
            lines.append("")

        # 提交项摘要
        lines.append("## 必需的提交项")
        lines.append(f"总计：{len(result.submittals)}")
        by_type = {}
        for sub in result.submittals:
            by_type[sub.submittal_type] = by_type.get(sub.submittal_type, 0) + 1
        for t, count in sorted(by_type.items()):
            lines.append(f"- {t}：{count}")
        lines.append("")

        # 产品摘要
        lines.append("## 产品/制造商")
        lines.append(f"总计：{len(result.products)}")

        return "\n".join(lines)

快速开始

# Create the extractor
extractor = SpecificationExtractor()

# Extract from a PDF (the path is an example; point it at your own file)
result = extractor.extract_from_pdf("Project_Specifications.pdf")

# Print how many sections, submittals and product requirements were found
print(f"找到 {len(result.sections)} 个章节")
print(f"找到 {len(result.submittals)} 个提交项")
print(f"找到 {len(result.products)} 个产品要求")

# Generate the submittal log (Markdown table)
submittal_log = extractor.generate_submittal_log(result)
print(submittal_log)

# Generate the product schedule (Markdown, grouped by section)
product_schedule = extractor.generate_product_schedule(result)
print(product_schedule)

# Full report
report = extractor.generate_report(result)
print(report)

依赖项

pip install pdfplumber

技能包地址：https://github.com/openclaw/skills/tree/main/skills/datadrivenconstruction/specification-extractor/SKILL.md

64 次点击 ∙ 0 人收藏

登录后收藏

0 条回复