1. 修改了表名表述的问题

2. 修正了为其他列生成问答时中文字段名的表述问题
2025-12-23 16:21:01 +08:00
parent 1a722de9a1
commit f408c87564
1 changed files with 71 additions and 84 deletions
--- a/qa_generator.py
+++ b/qa_generator.py
@@ -108,41 +108,41 @@ class QAGenerator:
        if data_type == "element":
            # 元素治理模板 - 严格基于"数据元素中文名"
            templates = []
-            table_name = item.get("表名", "元素治理模板")
+            table_name = item.get("表名", "远光数据架构元素治理模板表")

            # 只保留以数据元素中文名为标识符的模板
            if item.get("数据元素中文名") and item.get("业务领域名称"):
-                templates.append((f"「{item['数据元素中文名']}」在{table_name}中属于哪个业务领域？", f"该数据元素属于「{item['业务领域名称']}」"))
+                templates.append((f"在{table_name}中数据元素中文名为：{item['数据元素中文名']}属于哪个业务领域？", f"业务领域：{item['业务领域名称']}"))

            if item.get("数据元素中文名") and item.get("值类型"):
-                templates.append((f"查询{table_name}中，数据元素「{item['数据元素中文名']}」的值类型是什么？", f"值类型为「{item['值类型']}」"))
+                templates.append((f"查询{table_name}中数据元素中文名为：{item['数据元素中文名']}的值类型是什么？", f"值类型：{item['值类型']}"))

            if item.get("数据元素中文名") and item.get("总长度"):
-                templates.append((f"在{table_name}里，「{item['数据元素中文名']}」的总长度设置是多少？", f"总长度为{item['总长度']}"))
+                templates.append((f"在{table_name}中数据元素中文名为：{item['数据元素中文名']}的总长度设置是多少？", f"总长度：{item['总长度']}"))

            if item.get("数据元素中文名") and item.get("类别"):
-                templates.append((f"请确认「{item['数据元素中文名']}」在{table_name}中属于哪个类别？", f"该数据元素属于「{item['类别']}」类别"))
+                templates.append((f"请确认在{table_name}中数据元素中文名为：{item['数据元素中文名']}属于哪个类别？", f"类别：{item['类别']}"))

            if item.get("数据元素中文名") and item.get("数据元素英文名"):
-                templates.append((f"请查找「{item['数据元素中文名']}」对应的英文名是什么？", f"英文名为「{item['数据元素英文名']}」"))
+                templates.append((f"在{table_name}中数据元素中文名为：{item['数据元素中文名']}对应的英文名是什么？", f"英文名：{item['数据元素英文名']}"))

            if item.get("数据元素中文名") and item.get("是否枚举"):
-                templates.append((f"「{item['数据元素中文名']}」这个数据元素在{table_name}中是否枚举？", f"是否枚举为「{item['是否枚举']}」"))
+                templates.append((f"在{table_name}中数据元素中文名为：{item['数据元素中文名']}是否枚举？", f"是否枚举：{item['是否枚举']}"))

            if item.get("数据元素中文名") and item.get("枚举数量"):
-                templates.append((f"请问在{table_name}中，「{item['数据元素中文名']}」的枚举数量是多少？", f"枚举数量为{item['枚举数量']}"))
+                templates.append((f"请问在{table_name}中数据元素中文名为：{item['数据元素中文名']}的枚举数量是多少？", f"枚举数量：{item['枚举数量']}"))

            if item.get("数据元素中文名") and item.get("小数位"):
-                templates.append((f"请说明「{item['数据元素中文名']}」的小数位设置？", f"小数位为{item['小数位']}"))
+                templates.append((f"在{table_name}中数据元素中文名为：{item['数据元素中文名']}的小数位设置是多少？", f"小数位：{item['小数位']}"))

            if item.get("数据元素中文名") and item.get("抽象元素中文名"):
-                templates.append((f"在{table_name}里，「{item['数据元素中文名']}」的抽象元素中文名是什么？", f"抽象元素中文名为「{item['抽象元素中文名']}」"))
+                templates.append((f"在{table_name}中数据元素中文名为：{item['数据元素中文名']}的抽象元素中文名是什么？", f"抽象元素中文名：{item['抽象元素中文名']}"))

            if item.get("数据元素中文名") and item.get("说明"):
-                templates.append((f"请解释「{item['数据元素中文名']}」在{table_name}中的作用和含义", f"该数据元素的说明为「{item['说明']}」"))
+                templates.append((f"请解释在{table_name}中数据元素中文名为：{item['数据元素中文名']}的作用和含义", f"说明：{item['说明']}"))

            if item.get("数据元素中文名") and item.get("是否上线"):
-                templates.append((f"请问「{item['数据元素中文名']}」在{table_name}中是否已上线？", f"是否上线为「{item['是否上线']}」"))
+                templates.append((f"请问在{table_name}中数据元素中文名为：{item['数据元素中文名']}是否已上线？", f"是否上线：{item['是否上线']}"))

            # 生成QA
            for i, (question, answer) in enumerate(templates[:template_count]):
@@ -153,28 +153,32 @@ class QAGenerator:
                })

        elif data_type == "physical":
-            # 物理模型 - 严格基于"物理模型属性中文名"
-            table_name = item.get("表名", "物理模型")
+            # 物理模型 - 严格基于"字段中文名"提问
+            table_name = item.get("表名", "远光数据架构物理模型表")
+            field_name = item.get("字段中文名")
            templates = []

-            # 只保留以物理模型属性中文名为标识符的模板
-            if item.get("物理模型属性中文名") and item.get("值类型"):
-                templates.append((f"在{table_name}中，「{item['物理模型属性中文名']}」的值类型是什么？", f"该属性的值类型为「{item['值类型']}」"))
+            # 以字段中文名为主要提问对象
+            if field_name and item.get("值类型"):
+                templates.append((f"请问在{table_name}中字段中文名为：{field_name}的值类型是什么？", f"值类型：{item['值类型']}"))

-            if item.get("物理模型属性中文名") and item.get("长度"):
-                templates.append((f"请问「{item['物理模型属性中文名']}」在{table_name}中的长度是多少？", f"该属性长度为{item['长度']}"))
+            if field_name and item.get("长度"):
+                templates.append((f"请问在{table_name}中字段中文名为：{field_name}的长度是多少？", f"长度：{item['长度']}"))

-            if item.get("物理模型属性中文名") and item.get("小数位") is not None:
-                templates.append((f"请确认「{item['物理模型属性中文名']}」在{table_name}中的小数位设置", f"该属性小数位为{item['小数位']}"))
+            if field_name and item.get("小数位") is not None:
+                templates.append((f"请问在{table_name}中字段中文名为：{field_name}的小数位设置是多少？", f"小数位：{item['小数位']}"))

-            if item.get("物理模型属性中文名") and item.get("关联数据元素"):
-                templates.append((f"「{item['物理模型属性中文名']}」这个属性在{table_name}中关联哪个数据元素？", f"该属性关联的数据元素为「{item['关联数据元素']}」"))
+            if field_name and item.get("关联数据元素"):
+                templates.append((f"请问在{table_name}中字段中文名为：{field_name}关联的数据元素是什么？", f"关联数据元素：{item['关联数据元素']}"))

-            if item.get("物理模型属性中文名") and item.get("物理模型中文名"):
-                templates.append((f"请查找「{item['物理模型属性中文名']}」属于哪个物理模型？", f"该属性属于「{item['物理模型中文名']}」"))
+            if field_name and item.get("物理模型中文名"):
+                templates.append((f"请问在{table_name}中字段中文名为：{field_name}属于哪个物理模型？", f"物理模型：{item['物理模型中文名']}"))

-            if item.get("物理模型属性中文名") and item.get("说明"):
-                templates.append((f"请说明「{item['物理模型属性中文名']}」在{table_name}中的作用和用途", f"该属性的说明为「{item['说明']}」"))
+            if field_name and item.get("说明"):
+                templates.append((f"请问在{table_name}中字段中文名为：{field_name}的说明是什么？", f"说明：{item['说明']}"))
+
+            if field_name and item.get("物理模型属性英文名"):
+                templates.append((f"请问在{table_name}中字段中文名为：{field_name}对应的英文名是什么？", f"英文名：{item['物理模型属性英文名']}"))

            # 生成QA
            for i, (question, answer) in enumerate(templates[:template_count]):
@@ -186,33 +190,33 @@ class QAGenerator:

        elif data_type == "logical":
            # 逻辑模型 - 严格基于"字段中文名"
-            table_name = item.get("表名", "逻辑模型")
+            table_name = item.get("表名", "远光数据架构逻辑模型表")
            templates = []

            # 只保留以字段中文名为标识符的模板
            if item.get("字段中文名") and item.get("业务领域"):
-                templates.append((f"「{item['字段中文名']}」在{table_name}中属于哪个业务领域？", f"该字段属于「{item['业务领域']}」"))
+                templates.append((f"在{table_name}中字段中文名为：{item['字段中文名']}属于哪个业务领域？", f"业务领域：{item['业务领域']}"))

            if item.get("字段中文名") and item.get("逻辑模型中文名"):
-                templates.append((f"「{item['字段中文名']}」属于哪个逻辑模型？", f"该字段属于「{item['逻辑模型中文名']}」"))
+                templates.append((f"在{table_name}中字段中文名为：{item['字段中文名']}属于哪个逻辑模型？", f"逻辑模型：{item['逻辑模型中文名']}"))

            if item.get("字段中文名") and item.get("字段英文名"):
-                templates.append((f"在{table_name}中，「{item['字段中文名']}」对应的英文名是什么？", f"该字段的英文名为「{item['字段英文名']}」"))
+                templates.append((f"在{table_name}中字段中文名为：{item['字段中文名']}对应的英文名是什么？", f"英文名：{item['字段英文名']}"))

            if item.get("字段中文名") and item.get("值类型"):
-                templates.append((f"请问「{item['字段中文名']}」在{table_name}中的值类型是什么？", f"该字段的值类型为「{item['值类型']}」"))
+                templates.append((f"请问在{table_name}中字段中文名为：{item['字段中文名']}的值类型是什么？", f"值类型：{item['值类型']}"))

            if item.get("字段中文名") and item.get("长度"):
-                templates.append((f"查询{table_name}中，「{item['字段中文名']}」的长度是多少？", f"该字段长度为{item['长度']}"))
+                templates.append((f"查询{table_name}中字段中文名为：{item['字段中文名']}的长度是多少？", f"长度：{item['长度']}"))

            if item.get("字段中文名") and item.get("小数位") is not None:
-                templates.append((f"请确认「{item['字段中文名']}」在{table_name}中的小数位设置", f"该字段小数位为{item['小数位']}"))
+                templates.append((f"请确认在{table_name}中字段中文名为：{item['字段中文名']}的小数位设置", f"小数位：{item['小数位']}"))

            if item.get("字段中文名") and item.get("动态查询能力"):
-                templates.append((f"「{item['字段中文名']}」的动态查询能力是什么级别？", f"该字段的动态查询能力为「{item['动态查询能力']}」"))
+                templates.append((f"在{table_name}中字段中文名为：{item['字段中文名']}的动态查询能力是什么级别？", f"动态查询能力：{item['动态查询能力']}"))

            if item.get("字段中文名") and item.get("关联数据元素英文名"):
-                templates.append((f"在{table_name}中，「{item['字段中文名']}」关联的数据元素英文名是什么？", f"该字段关联的数据元素英文名为「{item['关联数据元素英文名']}」"))
+                templates.append((f"在{table_name}中字段中文名为：{item['字段中文名']}关联的数据元素英文名是什么？", f"关联数据元素英文名：{item['关联数据元素英文名']}"))

            # 生成QA
            for i, (question, answer) in enumerate(templates[:template_count]):
@@ -254,7 +258,7 @@ class QAGenerator:
            field_mapping = {
                "物理模型中文名": item.get("物理模型中文名"),
                "物理模型英文名": item.get("物理模型英文名"),
-                "物理模型属性中文名": item.get("物理模型属性中文名"),
+                "字段中文名": item.get("字段中文名"),
                "物理模型属性英文名": item.get("物理模型属性英文名"),
                "值类型": item.get("值类型"),
                "长度": item.get("长度"),
@@ -289,7 +293,7 @@ class QAGenerator:
            # 优先确保包含标识字段的组合
            identifier_fields = {
                "element": "数据元素中文名",
-                "physical": "物理模型属性中文名",
+                "physical": "字段中文名",
                "logical": "字段中文名"
            }

@@ -321,14 +325,14 @@ class QAGenerator:
        question_parts = []
        answer_parts = []

-        table_name = item.get("表名", "数据表")
+        table_name = item.get("表名", "远光数据架构表")

        # 当字段数量为1时，直接询问这个字段
        if field_count == 1:
            # 直接使用第一个字段作为标识和要询问的字段
            field_name, field_value = selected_fields[0]
-            question = f"请告诉我{table_name}中「{field_value}」是什么"
-            answer = f"{field_name}为「{field_value}」"
+            question = f"请告诉我{table_name}中{field_value}是什么"
+            answer = f"{field_name}：{field_value}"

            qa_pairs.append({
                "instruct": question,
@@ -345,7 +349,7 @@ class QAGenerator:
        # 查找标识字段
        identifier_fields = {
            "element": "数据元素中文名",
-            "physical": "物理模型属性中文名",
+            "physical": "字段中文名",
            "logical": "字段中文名"
        }

@@ -381,28 +385,25 @@ class QAGenerator:

        # 构建问题文本
        if len(question_parts) == 1:
-            question = f"请告诉我{table_name}中「{main_field}」的{question_parts[0]}"
+            question = f"请告诉我{table_name}中字段中文名为：{main_field}的{question_parts[0]}"
        elif len(question_parts) == 2:
            connector = self.config.get_random_element(connectors[:3])
-            question = f"请列举{table_name}中「{main_field}」的{question_parts[0]}{connector}{question_parts[1]}"
+            question = f"请列举{table_name}中字段中文名为：{main_field}的{question_parts[0]}{connector}{question_parts[1]}"
        else:
            connector1 = self.config.get_random_element(connectors[:3])
            connector2 = self.config.get_random_element(connectors[3:])
-            question = f"请列举{table_name}中「{main_field}」的{question_parts[0]}{connector1}{question_parts[1]}{connector2}{question_parts[2]}"
+            question = f"请列举{table_name}中字段中文名为：{main_field}的{question_parts[0]}{connector1}{question_parts[1]}{connector2}{question_parts[2]}"

        # 构建答案
        for field_name, field_value in query_fields:
-            answer_parts.append(f"{field_name}为「{field_value}」")
+            answer_parts.append(f"{field_name}：{field_value}")

        if len(answer_parts) == 1:
            answer = answer_parts[0]
        elif len(answer_parts) == 2:
-            connector = self.config.get_random_element(connectors)
-            answer = f"{answer_parts[0]}{connector}{answer_parts[1]}"
+            answer = f"{answer_parts[0]}；{answer_parts[1]}"
        else:
-            connector1 = self.config.get_random_element(connectors[:3])
-            connector2 = self.config.get_random_element(connectors[3:])
-            answer = f"{answer_parts[0]}{connector1}{answer_parts[1]}{connector2}{answer_parts[2]}"
+            answer = "；".join(answer_parts)

        # 生成QA对
        qa_pairs.append({
@@ -790,9 +791,8 @@ class QAGenerator:
                    # 生成综合性问答
                    if comprehensive_count > 0:
                        for _ in range(comprehensive_count):
-                            # 从文件名中提取表名（去掉.json后缀）
-                            table_display_name = file_info["name"]
-                            comprehensive_qa = self.generate_comprehensive_qa(item, data_type, table_display_name)
+                            # 直接从item中获取表名，不再使用file_info["name"]
+                            comprehensive_qa = self.generate_comprehensive_qa(item, data_type)
                            qa_pairs.extend(comprehensive_qa)

                qa_pairs = self.shuffle_qa_pairs(qa_pairs)
@@ -822,7 +822,7 @@ class QAGenerator:

        # 如果没有传入表名，从item中获取，否则使用默认值
        if table_name is None:
-            table_name = item.get("表名", "数据表")
+            table_name = item.get("表名", "远光数据架构表")

        if data_type == "element":
            # 元素治理模板的所有字段
@@ -845,7 +845,7 @@ class QAGenerator:
            field_mapping = {
                "物理模型中文名": item.get("物理模型中文名"),
                "物理模型英文名": item.get("物理模型英文名"),
-                "物理模型属性中文名": item.get("物理模型属性中文名"),
+                "字段中文名": item.get("字段中文名"),
                "物理模型属性英文名": item.get("物理模型属性英文名"),
                "值类型": item.get("值类型"),
                "长度": item.get("长度"),
@@ -880,7 +880,7 @@ class QAGenerator:
        # 获取标识字段
        identifier_fields = {
            "element": "数据元素中文名",
-            "physical": "物理模型属性中文名",
+            "physical": "字段中文名",
            "logical": "字段中文名"
        }

@@ -903,38 +903,25 @@ class QAGenerator:

        # 多种问法模板（加上表名进行区分）
        question_templates = [
-            f"在{table_name}中，请详细说明「{identifier_value}」的定义和相关信息",
-            f"在{table_name}中，「{identifier_value}」是什么意思？请解释其含义和特征",
-            f"请全面介绍{table_name}中「{identifier_value}」的概念和属性",
-            f"在{table_name}中，「{identifier_value}」是什么？请详细描述其特点",
-            f"请解释{table_name}中「{identifier_value}」的定义、作用和属性",
-            f"在{table_name}里，「{identifier_value}」具体指什么？请提供详细信息",
-            f"请说明{table_name}中「{identifier_value}」的含义、类型和相关属性",
-            f"在{table_name}中，「{identifier_value}」的概念和特征是什么？",
-            f"请详细介绍{table_name}中「{identifier_value}」的定义和相关信息",
-            f"在{table_name}中，「{identifier_value}」是什么概念？请解释其属性和含义"
+            f"在{table_name}中，请详细说明字段中文名为：{identifier_value}的定义和相关信息",
+            f"在{table_name}中，字段中文名为：{identifier_value}是什么意思？请解释其含义和特征",
+            f"请全面介绍{table_name}中字段中文名为：{identifier_value}的概念和属性",
+            f"在{table_name}中，字段中文名为：{identifier_value}是什么？请详细描述其特点",
+            f"请解释{table_name}中字段中文名为：{identifier_value}的定义、作用和属性",
+            f"在{table_name}里，字段中文名为：{identifier_value}具体指什么？请提供详细信息",
+            f"请说明{table_name}中字段中文名为：{identifier_value}的含义、类型和相关属性",
+            f"在{table_name}中，字段中文名为：{identifier_value}的概念和特征是什么？",
+            f"请详细介绍{table_name}中字段中文名为：{identifier_value}的定义和相关信息",
+            f"在{table_name}中，字段中文名为：{identifier_value}是什么概念？请解释其属性和含义"
        ]

        # 构建答案
        answer_parts = []
        for field_name, field_value in all_fields.items():
-            answer_parts.append(f"{field_name}为「{field_value}」")
+            answer_parts.append(f"{field_name}：{field_value}")

-        # 连接答案
-        if len(answer_parts) == 1:
-            answer = answer_parts[0]
-        elif len(answer_parts) == 2:
-            connector = self.config.get_random_element(self.config.CONNECTORS)
-            answer = f"{answer_parts[0]}{connector}{answer_parts[1]}"
-        else:
-            connectors = self.config.CONNECTORS
-            answer = answer_parts[0]
-            for part in answer_parts[1:-1]:
-                connector = self.config.get_random_element(connectors)
-                answer = f"{answer}{connector}{part}"
-            if answer_parts:
-                connector = self.config.get_random_element(connectors)
-                answer = f"{answer}{connector}{answer_parts[-1]}"
+        # 连接答案（使用分号分隔）
+        answer = "；".join(answer_parts)

        # 随机选择一个问法
        question = self.config.get_random_element(question_templates)
@@ -1195,7 +1182,7 @@ def main():
    # 设置是否生成综合性问答
    print("\n3. 综合性问答设置:")
    print("   综合性问答指一个问题询问所有字段的定义/含义")
-    print("   例如：「实际过账成本的成本中心」的定义是什么？")
+    print("   例如：实际过账成本的成本中心的定义是什么？")

    comprehensive_choice = input("是否添加综合性问答? (1=添加, 0=不添加, 默认0): ").strip()
    comprehensive_count = 0