1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
| import re, glob, sys, os
# Exact (stripped) lines that are ignored while scanning for question headers.
_SKIP_WORDS = frozenset({
    "标记讨论", "查看答案", "题目纠错", "我的纠错", "展开全部",
    "列表视图卡片视图", "答题进度", "题目列表", "分类导航", "题目筛选",
})
# Line prefixes that are ignored while scanning (they look like CSS residue).
_IGNORED_PREFIXES = (
    ".outer-corner", ".practice-", ".corr-", ".discussion-", ".case-intro",
    ".my-corr", "{", "@keyframes",
)
# Any of these substrings ends the evidence-download section.
_EVIDENCE_END_MARKERS = ("题目纠错", "答题进度", "题目列表")
# Number of lines after a question header that are scanned for its text.
_LOOKAHEAD_LINES = 7

_SOLVED_RE = re.compile(r'^\d+个用户已解出')
_PIXEL_RE = re.compile(r'^Pixel \d')
_HEADER_RE = re.compile(r'^(.+?)-(\d+)简答分值\s+(\d+)')
_NEXT_HEADER_RE = re.compile(r'^(.+?)-(\d+)简答分值')
# The bracket / colon / semicolon classes accept both the ASCII and the
# full-width (U+FF08, U+FF09, U+FF1A, U+FF1B) forms; the full-width ones are
# written as escapes so they cannot be silently normalised away.
_ANSWER_RE = re.compile(
    r'[【\[(\uff08](?:答案格式|答案)[:\uff1a;\uff1b]\s*(.+?)[】\])\uff09]'
)
_ANSWER_STRIP_RE = re.compile(
    r'[【\[(\uff08](?:答案格式|答案)[:\uff1a;\uff1b].+?[】\])\uff09]'
)
_HINT_RE = re.compile(r'【提示[:\uff1a].+?】')


def _extract_case_intro(lines):
    """Return the text between the 案件介绍 marker and the 检材下载 marker."""
    paragraphs = []
    in_case = False
    for line in lines:
        if "**案件介绍**" in line:
            in_case = True
            continue
        if "**检材下载**" in line:
            break
        text = line.strip()
        if in_case and text and text != "展开全部":
            paragraphs.append(text)
    return "\n\n".join(paragraphs)


def _extract_evidence(lines):
    """Return the lines following the 检材下载 marker, up to an end marker."""
    collected = []
    in_evidence = False
    for line in lines:
        if "**检材下载**" in line:
            in_evidence = True
            continue
        if any(marker in line for marker in _EVIDENCE_END_MARKERS):
            break
        text = line.strip()
        if in_evidence and text:
            collected.append(text)
    return "\n".join(collected)


def _read_question(lines, start):
    """Return ``(description, answer_format)`` for the header just before *start*."""
    desc = ""
    answer_format = ""
    for raw_line in lines[start:start + _LOOKAHEAD_LINES]:
        text = raw_line.strip()
        if not text or text in ("标记讨论", "查看答案"):
            continue
        if _SOLVED_RE.match(text):
            continue
        if _NEXT_HEADER_RE.match(text):
            # The next question starts here; nothing further belongs to us.
            break
        found = _ANSWER_RE.search(text)
        if found:
            answer_format = found.group(1).strip()
        if not desc:
            # The first surviving line is taken as the question text.
            desc = text
    desc = _ANSWER_STRIP_RE.sub('', desc)
    desc = _HINT_RE.sub('', desc)
    return desc.strip(), answer_format


def _parse_questions(lines):
    """Group questions by category name, in order of first appearance."""
    categories = {}
    for index, line in enumerate(lines):
        text = line.strip()
        if not text or text in _SKIP_WORDS:
            continue
        if text.startswith(_IGNORED_PREFIXES) or "!important" in text:
            continue
        if _SOLVED_RE.match(text) or _PIXEL_RE.match(text):
            continue
        header = _HEADER_RE.match(text)
        if not header:
            continue
        desc, answer_format = _read_question(lines, index + 1)
        # setdefault + append: a category that shows up again after another
        # category keeps the questions collected earlier instead of being
        # overwritten.
        categories.setdefault(header.group(1), []).append(
            {"desc": desc, "answer": answer_format}
        )
    return categories


def _render_markdown(case_intro, evidence, categories):
    """Build the output markdown text from the extracted pieces."""
    out = []
    if case_intro:
        out.append("# 案件介绍\n\n")
        out.append(case_intro + "\n\n")
    if evidence:
        out.append("# 案件与检材\n\n")
        out.append(evidence + "\n\n")
    out.append("# 题目分类\n\n")
    for catname, qs in categories.items():
        out.append(f"## {catname}\n\n")
        for q in qs:
            heading = f"### {q['desc']}"
            if q['answer']:
                heading += f"【答案格式:{q['answer']}】"
            out.append(heading + "\n\n")
        out.append("\n")
    return "".join(out)


def parse_didctf(filepath):
    """Parse a DIDCTF markdown export and write a tidied markdown file.

    The input is read as UTF-8. Three things are extracted: an optional case
    introduction, an optional evidence-download section, and every question
    whose header line matches ``<category>-<number>简答分值 <score>``. The
    result is written next to the input as ``<input stem>_整理.md``.

    Args:
        filepath: Path of the markdown file to parse.

    Returns:
        A tuple ``(output_path, category_count, question_count, categories)``
        where ``categories`` maps each category name to a list of
        ``{"desc": ..., "answer": ...}`` dicts.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        UnicodeDecodeError: If the input is not valid UTF-8.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        lines = f.read().split("\n")

    categories = _parse_questions(lines)
    output = _render_markdown(
        _extract_case_intro(lines), _extract_evidence(lines), categories
    )

    base, _ = os.path.splitext(filepath)
    output_path = base + "_整理.md"
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(output)
    return output_path, len(categories), sum(len(qs) for qs in categories.values()), categories
if __name__ == "__main__":
    # An explicit path on the command line wins; otherwise pick the most
    # recently modified DIDCTF_*.md under E:\google, ignoring any file whose
    # path contains "整理" (the suffix this script gives its own output).
    cli_args = sys.argv[1:]
    if cli_args:
        filepath = cli_args[0]
    else:
        candidates = [
            path
            for path in glob.glob(r"E:\google\DIDCTF_*.md")
            if "整理" not in path
        ]
        if not candidates:
            print("用法: python didctf_parse.py <markdown文件路径>")
            print("或把文件放到 E:\\google\\DIDCTF_*.md")
            sys.exit(1)
        filepath = max(candidates, key=os.path.getmtime)

    outpath, ncats, total, cats = parse_didctf(filepath)

    # Summary: input, output, totals, then one line per category.
    print(f"输入: {filepath}")
    print(f"输出: {outpath}")
    print(f"分类: {ncats} | 题目: {total}")
    for cat, qs in cats.items():
        print(f" {cat}: {len(qs)}题")
|