Files
YG-Rules/app/utils/guidance_analysis.py

214 lines
9.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""政策指引文本分析。"""
from __future__ import annotations

import io
import re
import shutil
import subprocess
import tempfile
from datetime import datetime
from pathlib import Path
class GuidanceAnalyzer:
    """Turn a policy-guidance document into keyword-ranked description patterns.

    The pipeline implemented by :meth:`analyze` is: extract plain text from the
    uploaded bytes, split it into paragraphs and sentences, keep the sentences
    that mention domain or "penetration" keywords, and render each kept
    sentence into a templated pattern record.
    """

    # Accepted granularity names (compared lower-cased) -> canonical level.
    GRANULARITY_MAP = {"low": "low", "high": "high", "coarse": "low", "medium": "high", "fine": "high"}
    # Risk domain -> keywords that mark a sentence as relevant to that domain.
    DOMAIN_KEYWORDS = {
        "过度负债": ["资产负债率", "负债", "债务", "有息负债", "偿债", "预警", "现金净流量"],
        "财务金融风险": ["穿透", "实际控制人", "最终受益人", "资金流向", "交易对手", "账户"],
        "虚假贸易": ["贸易", "合同", "交易", "空转", "融资性贸易"],
    }
    # Supervision dimension -> keywords; dict order decides which dimension wins.
    PENETRATION_KEYWORDS = {
        "主体穿透": ["实际控制人", "最终受益人", "责任主体", "关联方"],
        "资金穿透": ["资金流向", "账户", "资金闭环", "回款"],
        "交易穿透": ["交易对手", "合同", "贸易", "交易链条"],
    }

    # Full-width punctuation that has an ASCII look-alike is spelled as \u escapes
    # (U+FF01 '!', U+FF1F '?', U+FF1B ';', U+FF0C ',') so the two forms can neither
    # be confused by a reader nor silently collapsed into one by tooling.
    _CLAUSE_END_CHARS = "。;\uff1b"
    _SENTENCE_SPLIT_RE = re.compile(r"(?<=[。!?;\uff01\uff1f\uff1b])\s*")
    _NOTE_SPLIT_RE = re.compile(r"[\s,\uff0c、;\uff1b]+")
    _REGULATION_REF_RE = re.compile(r"《[^》]+》")
    _LEADING_REF_RE = re.compile(r"^《[^》]+》[^。;\uff1b]*[。;\uff1b]?")

    _GENERAL_DIMENSION = "一般监管"
    # Maximum number of candidates kept per canonical granularity.
    _LOW_LIMIT = 12
    _HIGH_LIMIT = 30

    def normalize_granularity(self, granularity: str | None) -> str:
        """Return the canonical granularity for *granularity*.

        ``None`` or an empty string is treated as ``"high"``. A name that is
        not in ``GRANULARITY_MAP`` is returned unchanged.
        """
        requested = granularity or "high"
        return self.GRANULARITY_MAP.get(requested.lower(), requested)

    def is_supported_granularity(self, granularity: str | None) -> bool:
        """Return whether *granularity* (default ``"high"``) is a known name."""
        return (granularity or "high").lower() in self.GRANULARITY_MAP

    def extract_text(self, content: bytes, filename: str) -> dict:
        """Extract plain text from *content*, choosing a parser by file suffix.

        Returns a dict with keys ``method``, ``status`` and ``text``. Suffixes
        other than ``.pdf``, ``.docx``, ``.doc`` and ``.md`` are decoded as
        UTF-8 text (a leading BOM is dropped, undecodable bytes are ignored).
        """
        suffix = Path(filename).suffix.lower()
        if suffix == ".pdf":
            method, text = "pdf", self._extract_pdf_text(content)
        elif suffix == ".docx":
            method, text = "docx", self._extract_docx_text(content)
        elif suffix == ".doc":
            method, text = "doc", self._extract_doc_text(content)
        else:
            method = "markdown" if suffix == ".md" else "plain_text"
            text = content.decode("utf-8-sig", errors="ignore")
        return {"method": method, "status": "ok", "text": text}

    def _extract_pdf_text(self, content: bytes) -> str:
        """Return the text of every PDF page, joined with newlines."""
        from pypdf import PdfReader

        # The bytes are already in memory, so an in-memory stream is enough.
        reader = PdfReader(io.BytesIO(content))
        return "\n".join(page.extract_text() or "" for page in reader.pages)

    def _extract_docx_text(self, content: bytes) -> str:
        """Return the non-blank paragraphs of a .docx file, joined with newlines."""
        from docx import Document

        # BytesIO provides the full seekable file interface; SpooledTemporaryFile
        # only gained it in Python 3.11.
        document = Document(io.BytesIO(content))
        return "\n".join(paragraph.text for paragraph in document.paragraphs if paragraph.text.strip())

    def _extract_doc_text(self, content: bytes) -> str:
        """Convert a legacy .doc file to text with the external ``antiword`` tool.

        Raises:
            RuntimeError: if ``antiword`` is not found on ``PATH``.
            subprocess.CalledProcessError: if ``antiword`` exits non-zero.
        """
        antiword = shutil.which("antiword")
        if not antiword:
            raise RuntimeError("解析 .doc 文件需要容器安装 antiword")
        # antiword reads from a path, so the bytes have to be written to disk.
        with tempfile.NamedTemporaryFile(suffix=".doc") as file:
            file.write(content)
            file.flush()
            # NOTE(review): the output is decoded as UTF-8; confirm that antiword
            # is configured (locale / mapping file) to emit UTF-8 in the container.
            result = subprocess.run(
                [antiword, file.name],
                check=True,
                capture_output=True,
                text=True,
                encoding="utf-8",
                errors="ignore",
            )
        return result.stdout

    def split_sentences(self, paragraphs: list[str]) -> list[dict]:
        """Split paragraphs into sentence records.

        A sentence ends after ``。``, ``!``, ``?`` or ``;`` in either ASCII or
        full-width form; the terminator stays attached to the sentence. Each
        record holds ``text``, the 1-based ``paragraph_index`` and a 1-based
        ``sentence_index`` that runs across all paragraphs.
        """
        sentences: list[dict] = []
        for paragraph_index, paragraph in enumerate(paragraphs, start=1):
            for part in self._SENTENCE_SPLIT_RE.split(paragraph.strip()):
                text = part.strip()
                if not text:
                    continue
                sentences.append({
                    "text": text,
                    "paragraph_index": paragraph_index,
                    "sentence_index": len(sentences) + 1,
                })
        return sentences

    def extract_regulation_refs(self, text: str) -> list[str]:
        """Return the distinct ``《…》`` references in *text*, in first-seen order."""
        return list(dict.fromkeys(self._REGULATION_REF_RE.findall(text)))

    def select_candidates(
        self,
        domain: str,
        note: str,
        sentences: list[dict],
        granularity: str = "high",
    ) -> list[dict]:
        """Pick and rank the sentences that mention relevant keywords.

        Keywords are the ``DOMAIN_KEYWORDS`` of *domain* plus the tokens of
        *note* (split on whitespace, commas, ``、`` and semicolons); when both
        are empty the domain name itself is the only keyword. A sentence scores
        2 per matched keyword (case-insensitive) plus 1 per penetration keyword.
        Results are ordered by descending score, then by ``sentence_index``, and
        truncated to 12 entries for low granularity or 30 otherwise.
        """
        # A dict (not a set) keeps first-seen order, so "matched_keywords" is
        # reproducible across runs despite string hash randomisation.
        keywords = dict.fromkeys(self.DOMAIN_KEYWORDS.get(domain, []))
        keywords.update(dict.fromkeys(token for token in self._NOTE_SPLIT_RE.split(note or "") if token))
        if not keywords and domain:
            keywords[domain] = None
        lowered_keywords = [(keyword, keyword.lower()) for keyword in keywords if keyword]
        penetration_words = [word for words in self.PENETRATION_KEYWORDS.values() for word in words]

        candidates = []
        for sentence in sentences:
            text = sentence["text"]
            lowered_text = text.lower()
            matched = [keyword for keyword, lowered in lowered_keywords if lowered in lowered_text]
            penetration_hits = [word for word in penetration_words if word in text]
            if not matched and not penetration_hits:
                continue
            candidates.append({
                **sentence,
                "matched_keywords": matched or penetration_hits,
                "supervision_dimension": self._dimension_for(text),
                "score": len(matched) * 2 + len(penetration_hits),
            })
        candidates.sort(key=lambda item: (-item["score"], item["sentence_index"]))
        return candidates[: self._candidate_limit(self.normalize_granularity(granularity))]

    def analyze(
        self,
        domain_record: dict,
        content: bytes,
        filename: str,
        granularity: str = "high",
    ) -> dict:
        """Run the full analysis for one uploaded guidance document.

        Args:
            domain_record: mapping read via ``.get("domain")`` and ``.get("note")``.
            content: raw bytes of the uploaded file.
            filename: original file name; its suffix selects the text extractor.
            granularity: any key of ``GRANULARITY_MAP`` (case-insensitive).

        Returns:
            A result dict with extraction metadata, text statistics and the
            list of ``description_patterns``. When no sentence matches any
            keyword, the first sentence is used as the single fallback pattern.

        Raises:
            ValueError: if *granularity* is not a supported name.
        """
        if not self.is_supported_granularity(granularity):
            raise ValueError(f"不支持的 granularity: {granularity}")
        normalized = self.normalize_granularity(granularity)

        extraction = self.extract_text(content, filename)
        text = extraction["text"]
        paragraphs = [line.strip() for line in re.split(r"\n+", text) if line.strip()]
        sentences = self.split_sentences(paragraphs)
        domain = domain_record.get("domain", "")
        candidates = self.select_candidates(domain, domain_record.get("note", ""), sentences, normalized)
        if not candidates and sentences:
            candidates = [{**sentences[0], "matched_keywords": [], "supervision_dimension": self._GENERAL_DIMENSION}]

        # Regulation references are only collected at high granularity.
        refs = [] if normalized == "low" else self.extract_regulation_refs(text)
        patterns = []
        for index, candidate in enumerate(candidates[: self._candidate_limit(normalized)], start=1):
            source = candidate["text"]
            basis = self._basis_text(source)
            dimension = candidate.get("supervision_dimension") or self._dimension_for(source)
            if dimension != self._GENERAL_DIMENSION:
                usage = "用于后续穿透识别实际控制关系、资金流向和责任主体认定。"
            else:
                usage = "用于后续规则生成、选表或领域风险识别的参考句式。"
            patterns.append({
                "id": f"pattern-{index:03d}",
                "source_index": index,
                "paragraph_index": candidate.get("paragraph_index", index),
                "sentence_index": candidate.get("sentence_index", index),
                "source_sentence": source,
                "supervision_dimension": dimension,
                # Copied so that editing one pattern cannot alter the others.
                "core_regulations": list(refs),
                "basis_text": basis,
                "description_pattern": self._pattern_text(refs, basis, dimension),
                "usage": usage,
                "relevance": "high" if candidate.get("score", 0) >= 3 else "medium",
                "keywords": candidate.get("matched_keywords", []),
            })
        return {
            "status": "done",
            "domain": domain,
            "filename": filename,
            "extracted_at": datetime.now().isoformat(),
            "policy_focus": "穿透式监管",
            "core_regulations": refs,
            "granularity": normalized,
            "text_extraction": {"method": extraction["method"], "status": extraction["status"]},
            "text_stats": {
                "paragraph_count": len(paragraphs),
                "sentence_count": len(sentences),
                "candidate_count": len(candidates),
            },
            "description_patterns": patterns,
        }

    def _candidate_limit(self, normalized: str) -> int:
        """Return the maximum candidate count for a canonical granularity."""
        return self._LOW_LIMIT if normalized == "low" else self._HIGH_LIMIT

    def _dimension_for(self, text: str) -> str:
        """Return the first penetration dimension whose keyword occurs in *text*.

        Falls back to the general-supervision label when nothing matches.
        """
        for dimension, keywords in self.PENETRATION_KEYWORDS.items():
            if any(keyword in text for keyword in keywords):
                return dimension
        return self._GENERAL_DIMENSION

    def _basis_text(self, sentence: str) -> str:
        """Derive the basis text of a pattern from its source sentence.

        A leading ``《…》`` reference (up to and including the first clause
        terminator) is removed unless that would leave nothing; "国有企业" is
        generalised to "企业"; sentences that hit a penetration dimension get the
        "监管要求" prefix. A non-empty result always ends with a clause
        terminator, ``。`` being appended when the sentence has none.
        """
        text = self._LEADING_REF_RE.sub("", sentence).strip() or sentence.strip()
        text = text.replace("国有企业", "企业")
        if self._dimension_for(text) != self._GENERAL_DIMENSION:
            text = f"监管要求{text}"
        if text and not text.endswith(tuple(self._CLAUSE_END_CHARS)):
            text = f"{text}。"
        return text

    def _pattern_text(self, refs: list[str], basis: str, dimension: str) -> str:
        """Render the description pattern for one candidate.

        The trailing clause terminator of *basis* is removed before the fixed
        suffix is appended, so the result never contains "。,". When *refs* is
        non-empty they are prepended, concatenated without a separator (each
        reference is already delimited by ``《》``).
        """
        body = basis.rstrip(self._CLAUSE_END_CHARS)
        if dimension != self._GENERAL_DIMENSION:
            clause = f"条款要点:穿透识别关键主体、资金流向和责任边界,{body},防范责任悬空风险。"
        else:
            clause = f"条款要点:{body},防范相关风险。"
        if refs:
            return f"核心法规:{''.join(refs)}{clause}"
        return clause