import pandas as pd # 读取Excel文件的指定工作表和列 df1 = pd.read_excel( "/home/baol/tools/数据草稿-案款1.xls", sheet_name="一案一账户支", usecols=["支款案号", "来源案号", "申请人", "支付日期", "支付金额", "领款人"], ) df2 = pd.read_excel( "/home/baol/tools/数据草稿-案款1.xls", sheet_name="银行支出", usecols=["对方户名", "交易时间", "支出金额"], header=1, ) # 标准化日期格式为 yyyy-mm-dd df1["支付日期"] = pd.to_datetime(df1["支付日期"]).dt.strftime("%Y-%m-%d") df2["交易时间"] = pd.to_datetime(df2["交易时间"]).dt.strftime("%Y-%m-%d") # 确保金额为数值类型 df1["支付金额"] = pd.to_numeric(df1["支付金额"], errors="coerce") df2["支出金额"] = pd.to_numeric(df2["支出金额"], errors="coerce") # 初始化任务标记列 df1["任务"] = "任务3" df2["任务"] = "任务3" # 任务2:查找在某一天有多笔金额相同的记录 # 标记df1中的重复记录 df1_duplicates = df1[df1.duplicated(subset=["支付日期", "支付金额"], keep=False)] df1.loc[df1_duplicates.index, "任务"] = "任务2" # 标记df2中的重复记录 df2_duplicates = df2[df2.duplicated(subset=["交易时间", "支出金额"], keep=False)] df2.loc[df2_duplicates.index, "任务"] = "任务2" # 为匹配准备唯一记录(任务1) df1_unique = ( df1[df1["任务"] == "任务3"].reset_index().rename(columns={"index": "df1_index"}) ) df2_unique = ( df2[df2["任务"] == "任务3"].reset_index().rename(columns={"index": "df2_index"}) ) # 创建辅助列用于匹配 df1_unique["日期"] = df1_unique["支付日期"] df1_unique["金额"] = df1_unique["支付金额"] df1_unique["名称"] = df1_unique["领款人"] df2_unique["日期"] = df2_unique["交易时间"] df2_unique["金额"] = df2_unique["支出金额"] df2_unique["名称"] = df2_unique["对方户名"] # 条件1:支付金额与支出金额匹配,支付日期与交易时间匹配 condition1_matches = pd.merge( df1_unique, df2_unique, on=["日期", "金额"], how="inner", suffixes=("_df1", "_df2") ) df1.loc[condition1_matches["df1_index"], "任务"] = "任务1" df2.loc[condition1_matches["df2_index"], "任务"] = "任务1" # 条件2:支付金额与支出金额匹配,领款人与对方户名匹配 condition2_matches = pd.merge( df1_unique, df2_unique, on=["金额", "名称"], how="inner", suffixes=("_df1", "_df2") ) df1.loc[condition2_matches["df1_index"], "任务"] = "任务1" df2.loc[condition2_matches["df2_index"], "任务"] = "任务1" # 合并条件匹配的结果 task1_matches = pd.concat([condition1_matches, condition2_matches], ignore_index=True) task1_matches = task1_matches.drop_duplicates(subset=["df1_index", "df2_index"]) # 任务3:提取不符合任务1和任务2的记录 task3_df1 = df1[df1["任务"] == "任务3"] task3_df2 = df2[df2["任务"] == "任务3"] # 创建一个新的Excel文件,保存结果 with pd.ExcelWriter("/home/baol/tools/匹配结果.xlsx") as writer: # 任务1结果 task1_result = task1_matches[ [ "支款案号", "来源案号", "申请人", "支付日期", "支付金额", "领款人", "对方户名", "交易时间", "支出金额", ] ] task1_result.to_excel(writer, sheet_name="任务1匹配结果", index=False) # 任务2结果 df1_duplicates.to_excel( writer, sheet_name="任务2_一案一账户支重复记录", index=False ) df2_duplicates.to_excel(writer, sheet_name="任务2_银行支出重复记录", index=False) # 任务3结果 task3_df1.to_excel(writer, sheet_name="任务3_一案一账户支未匹配记录", index=False) task3_df2.to_excel(writer, sheet_name="任务3_支出未匹配记录", index=False) print("结果已保存到 '匹配结果.xlsx' 文件中。")