from __future__ import annotations

import re

# ASCII word-like tokens (lowercase letters, digits, underscore). Applied to
# text that has already been lowercased.
_ASCII_TOKEN_RE = re.compile(r"[a-z0-9_]+")
# Contiguous runs of characters in the U+4E00..U+9FFF range.
_CJK_RUN_RE = re.compile(r"[\u4e00-\u9fff]+")


def extract_match_terms(text: str | None) -> list[str]:
    """Split *text* into de-duplicated search terms.

    Two kinds of terms are produced, in this order:

    * ASCII tokens: runs of ``[a-z0-9_]`` taken from the lowercased text,
      kept only when at least 3 characters long.
    * CJK terms: every run of U+4E00..U+9FFF characters that is at least
      2 characters long is added whole; runs longer than 4 characters
      additionally contribute each of their overlapping 2-character
      windows.

    Args:
        text: Input text. ``None`` and ``""`` both yield an empty list.

    Returns:
        The terms in first-seen order with duplicates removed.
    """
    raw = text or ""
    terms = [
        token for token in _ASCII_TOKEN_RE.findall(raw.lower()) if len(token) >= 3
    ]
    for chunk in _CJK_RUN_RE.findall(raw):
        if len(chunk) < 2:
            continue
        terms.append(chunk)
        if len(chunk) > 4:
            # Long runs rarely appear verbatim elsewhere, so their bigrams
            # are added as extra, shorter terms.
            terms.extend(chunk[start : start + 2] for start in range(len(chunk) - 1))
    # dict preserves insertion order, so this de-duplicates while keeping
    # the first occurrence of each term in place.
    return list(dict.fromkeys(terms))


def score_text_match(query_text: str, *corpus_parts: str | None) -> tuple[float, list[str]]:
    """Score how well the terms of *query_text* are found in a corpus.

    The corpus is the space-joined, lowercased concatenation of the truthy
    *corpus_parts*. A query term counts as matched when it occurs anywhere
    in that string; this is plain substring containment, so an ASCII term
    also matches inside a longer word.

    The score combines two ratios:

    * coverage: matched terms / all query terms (weight 0.7)
    * density: matched terms capped at 4, divided by 4 (weight 0.3)

    Args:
        query_text: Text whose terms are looked up.
        *corpus_parts: Pieces of text to search; ``None`` and empty strings
            are skipped.

    Returns:
        ``(score, matched_terms)`` where ``score`` is in ``[0.0, 1.0]``
        rounded to 3 decimals and ``matched_terms`` keeps query-term order.
        ``(0.0, [])`` when the query has no terms or nothing matches.
    """
    query_terms = extract_match_terms(query_text)
    if not query_terms:
        return 0.0, []

    corpus = " ".join(part for part in corpus_parts if part).lower()
    matched_terms = [term for term in query_terms if term in corpus]
    if not matched_terms:
        return 0.0, []

    # query_terms is non-empty here, so the division is safe.
    coverage = len(matched_terms) / len(query_terms)
    density = min(len(matched_terms), 4) / 4
    score = min(1.0, coverage * 0.7 + density * 0.3)
    return round(score, 3), matched_terms