river-sim/case_out.py
2024-12-25 16:38:07 +08:00

105 lines
3.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import pandas as pd
# 读取Excel文件的指定工作表和列
df1 = pd.read_excel(
"/home/baol/tools/数据草稿-案款1.xls",
sheet_name="一案一账户支",
usecols=["支款案号", "来源案号", "申请人", "支付日期", "支付金额", "领款人"],
)
df2 = pd.read_excel(
"/home/baol/tools/数据草稿-案款1.xls",
sheet_name="银行支出",
usecols=["对方户名", "交易时间", "支出金额"],
header=1,
)
# 标准化日期格式为 yyyy-mm-dd
df1["支付日期"] = pd.to_datetime(df1["支付日期"]).dt.strftime("%Y-%m-%d")
df2["交易时间"] = pd.to_datetime(df2["交易时间"]).dt.strftime("%Y-%m-%d")
# 确保金额为数值类型
df1["支付金额"] = pd.to_numeric(df1["支付金额"], errors="coerce")
df2["支出金额"] = pd.to_numeric(df2["支出金额"], errors="coerce")
# 初始化任务标记列
df1["任务"] = "任务3"
df2["任务"] = "任务3"
# 任务2查找在某一天有多笔金额相同的记录
# 标记df1中的重复记录
df1_duplicates = df1[df1.duplicated(subset=["支付日期", "支付金额"], keep=False)]
df1.loc[df1_duplicates.index, "任务"] = "任务2"
# 标记df2中的重复记录
df2_duplicates = df2[df2.duplicated(subset=["交易时间", "支出金额"], keep=False)]
df2.loc[df2_duplicates.index, "任务"] = "任务2"
# 为匹配准备唯一记录任务1
df1_unique = (
df1[df1["任务"] == "任务3"].reset_index().rename(columns={"index": "df1_index"})
)
df2_unique = (
df2[df2["任务"] == "任务3"].reset_index().rename(columns={"index": "df2_index"})
)
# 创建辅助列用于匹配
df1_unique["日期"] = df1_unique["支付日期"]
df1_unique["金额"] = df1_unique["支付金额"]
df1_unique["名称"] = df1_unique["领款人"]
df2_unique["日期"] = df2_unique["交易时间"]
df2_unique["金额"] = df2_unique["支出金额"]
df2_unique["名称"] = df2_unique["对方户名"]
# 条件1支付金额与支出金额匹配支付日期与交易时间匹配
condition1_matches = pd.merge(
df1_unique, df2_unique, on=["日期", "金额"], how="inner", suffixes=("_df1", "_df2")
)
df1.loc[condition1_matches["df1_index"], "任务"] = "任务1"
df2.loc[condition1_matches["df2_index"], "任务"] = "任务1"
# 条件2支付金额与支出金额匹配领款人与对方户名匹配
condition2_matches = pd.merge(
df1_unique, df2_unique, on=["金额", "名称"], how="inner", suffixes=("_df1", "_df2")
)
df1.loc[condition2_matches["df1_index"], "任务"] = "任务1"
df2.loc[condition2_matches["df2_index"], "任务"] = "任务1"
# 合并条件匹配的结果
task1_matches = pd.concat([condition1_matches, condition2_matches], ignore_index=True)
task1_matches = task1_matches.drop_duplicates(subset=["df1_index", "df2_index"])
# 任务3提取不符合任务1和任务2的记录
task3_df1 = df1[df1["任务"] == "任务3"]
task3_df2 = df2[df2["任务"] == "任务3"]
# 创建一个新的Excel文件保存结果
with pd.ExcelWriter("/home/baol/tools/匹配结果.xlsx") as writer:
# 任务1结果
task1_result = task1_matches[
[
"支款案号",
"来源案号",
"申请人",
"支付日期",
"支付金额",
"领款人",
"对方户名",
"交易时间",
"支出金额",
]
]
task1_result.to_excel(writer, sheet_name="任务1匹配结果", index=False)
# 任务2结果
df1_duplicates.to_excel(
writer, sheet_name="任务2_一案一账户支重复记录", index=False
)
df2_duplicates.to_excel(writer, sheet_name="任务2_银行支出重复记录", index=False)
# 任务3结果
task3_df1.to_excel(writer, sheet_name="任务3_一案一账户支未匹配记录", index=False)
task3_df2.to_excel(writer, sheet_name="任务3_支出未匹配记录", index=False)
print("结果已保存到 '匹配结果.xlsx' 文件中。")