add feature

This commit is contained in:
baol 2024-12-25 16:38:07 +08:00
parent a931d9ceb4
commit aafe5e0195
4 changed files with 208 additions and 5 deletions

View File

@ -40,8 +40,8 @@ merged_df = pd.merge(
# 检索出在某一天同时有多笔金额相同的记录
duplicate_records = merged_df.groupby(['金额', '交易时间']).filter(lambda x: len(x) > 1)
duplicate_records = merged_df.groupby(["金额", "交易时间"]).filter(lambda x: len(x) > 1)
# 将结果保存为新的Excel文件
output_file_path = '/home/baol/tools/case2_records.xlsx' # 替换为你希望保存的文件路径
duplicate_records.to_excel(output_file_path, index=False, engine='openpyxl')
output_file_path = "/home/baol/tools/case2_records.xlsx" # 替换为你希望保存的文件路径
duplicate_records.to_excel(output_file_path, index=False, engine="openpyxl")

99
case_int.py Normal file
View File

@ -0,0 +1,99 @@
"""Reconcile case-account receipts against bank income records.

Reads the "一案一账户收" and "银行-收入" sheets of the draft workbook, sorts
every record into one of three tasks and writes the results to one workbook:

* Task 1 - a receipt and a bank record whose amount and date are equal, where
  that (date, amount) pair occurs exactly once in each sheet.
* Task 2 - records that share their date and amount with at least one other
  record in the same sheet.
* Task 3 - every record that is in neither Task 1 nor Task 2.
"""
import pandas as pd

SOURCE_PATH = "/home/baol/tools/数据草稿-案款1.xls"
RESULT_PATH = "/home/baol/tools/任务结果.xlsx"

# 1. Load the two sheets (the bank sheet's column names are on its second row).
df_case = pd.read_excel(SOURCE_PATH, sheet_name="一案一账户收")
df_bank = pd.read_excel(SOURCE_PATH, sheet_name="银行-收入", header=1)

# 2. Keep only the columns that are needed.  The explicit .copy() makes each
#    frame independent of the one it was sliced from, so the column
#    assignments below cannot trigger SettingWithCopyWarning.
df_case = df_case[["归属本案金额", "到账金额", "到账日期", "缴款人", "案号"]].copy()
df_bank = df_bank[["对方户名", "交易时间", "金额"]].copy()

# 3. Normalise both date columns to yyyy-mm-dd strings so they compare equal.
df_case["到账日期"] = pd.to_datetime(df_case["到账日期"]).dt.strftime("%Y-%m-%d")
df_bank["交易时间"] = pd.to_datetime(df_bank["交易时间"]).dt.strftime("%Y-%m-%d")

# 4. Task 1: amount and date match, and the (date, amount) pair is unique on
#    both sides.  The original row labels travel through the merge as
#    case_index / bank_index so the matched rows can be identified afterwards
#    without re-deriving them from the key values.
df_merged = pd.merge(
    df_case.reset_index().rename(columns={"index": "case_index"}),
    df_bank.reset_index().rename(columns={"index": "bank_index"}),
    left_on=["到账金额", "到账日期"],
    right_on=["金额", "交易时间"],
    how="inner",
)

# How many times each (date, amount) pair occurs in each sheet.
case_counts = (
    df_case.groupby(["到账日期", "到账金额"]).size().reset_index(name="case_count")
)
bank_counts = (
    df_bank.groupby(["交易时间", "金额"]).size().reset_index(name="bank_count")
)

# Attach the occurrence counts to every matched pair.
df_merged = pd.merge(df_merged, case_counts, on=["到账日期", "到账金额"])
df_merged = pd.merge(df_merged, bank_counts, on=["交易时间", "金额"])

# Keep only the pairs whose key occurs exactly once in both sheets.
task1_matches = df_merged[
    (df_merged["case_count"] == 1) & (df_merged["bank_count"] == 1)
]
task1_case_indices = pd.Index(task1_matches["case_index"])
task1_bank_indices = pd.Index(task1_matches["bank_index"])
# The helper index columns are bookkeeping only; keep them out of the export.
task1_result = task1_matches.drop(columns=["case_index", "bank_index"])

# 5. Task 2: records sharing date and amount with another record in the same
#    sheet.
task2_case = df_case.groupby(["到账日期", "到账金额"]).filter(lambda x: len(x) > 1)
task2_case_indices = task2_case.index
task2_bank = df_bank.groupby(["交易时间", "金额"]).filter(lambda x: len(x) > 1)
task2_bank_indices = task2_bank.index

# 6. Task 3: whatever is left once Task 1 and Task 2 rows are removed.
task3_case_indices = df_case.index.difference(
    task1_case_indices.union(task2_case_indices)
)
task3_case_result = df_case.loc[task3_case_indices]
task3_bank_indices = df_bank.index.difference(
    task1_bank_indices.union(task2_bank_indices)
)
task3_bank_result = df_bank.loc[task3_bank_indices]

# 7. Write every result set to its own sheet of the output workbook.
with pd.ExcelWriter(RESULT_PATH) as writer:
    task1_result.to_excel(writer, sheet_name="任务1", index=False)
    task2_case.to_excel(writer, sheet_name="任务2_一案一账户收", index=False)
    task2_bank.to_excel(writer, sheet_name="任务2_银行收入", index=False)
    task3_case_result.to_excel(writer, sheet_name="任务3_一案一账户收", index=False)
    task3_bank_result.to_excel(writer, sheet_name="任务3_银行收入", index=False)

print("所有任务结果已保存到 '任务结果.xlsx' 文件中。")

104
case_out.py Normal file
View File

@ -0,0 +1,104 @@
"""Reconcile case-account payments against bank expenditure records.

Reads the "一案一账户支" and "银行支出" sheets of the draft workbook, labels
every record in a "任务" column and writes the results to one workbook:

* 任务2 - records that share their date and amount with another record in the
  same sheet.
* 任务1 - remaining records that pair up across the sheets either on
  (date, amount) or on (amount, payee name / counterparty name).
* 任务3 - records that received neither of the labels above.
"""
import pandas as pd

SOURCE_PATH = "/home/baol/tools/数据草稿-案款1.xls"
RESULT_PATH = "/home/baol/tools/匹配结果.xlsx"

# Load only the needed columns (the bank sheet's column names are on its
# second row).
df1 = pd.read_excel(
    SOURCE_PATH,
    sheet_name="一案一账户支",
    usecols=["支款案号", "来源案号", "申请人", "支付日期", "支付金额", "领款人"],
)
df2 = pd.read_excel(
    SOURCE_PATH,
    sheet_name="银行支出",
    usecols=["对方户名", "交易时间", "支出金额"],
    header=1,
)

# Normalise both date columns to yyyy-mm-dd strings so they compare equal.
df1["支付日期"] = pd.to_datetime(df1["支付日期"]).dt.strftime("%Y-%m-%d")
df2["交易时间"] = pd.to_datetime(df2["交易时间"]).dt.strftime("%Y-%m-%d")

# Force the amounts to numbers; anything unparsable becomes NaN.
df1["支付金额"] = pd.to_numeric(df1["支付金额"], errors="coerce")
df2["支出金额"] = pd.to_numeric(df2["支出金额"], errors="coerce")

# Every record starts out as Task 3 and is relabelled when a rule applies.
df1["任务"] = "任务3"
df2["任务"] = "任务3"

# Task 2: several records with the same date and amount in one sheet.
# The label is written BEFORE the duplicate rows are extracted, so the
# exported Task 2 sheets carry "任务2" rather than the initial "任务3".
df1_is_duplicate = df1.duplicated(subset=["支付日期", "支付金额"], keep=False)
df1.loc[df1_is_duplicate, "任务"] = "任务2"
df1_duplicates = df1[df1_is_duplicate]

df2_is_duplicate = df2.duplicated(subset=["交易时间", "支出金额"], keep=False)
df2.loc[df2_is_duplicate, "任务"] = "任务2"
df2_duplicates = df2[df2_is_duplicate]

# Task 1 candidates: the non-duplicated records, with their original row
# labels preserved in df1_index / df2_index so df1 / df2 can be updated later.
df1_unique = (
    df1[df1["任务"] == "任务3"].reset_index().rename(columns={"index": "df1_index"})
)
df2_unique = (
    df2[df2["任务"] == "任务3"].reset_index().rename(columns={"index": "df2_index"})
)

# Helper columns that give both sides common key names for merging.
df1_unique["日期"] = df1_unique["支付日期"]
df1_unique["金额"] = df1_unique["支付金额"]
df1_unique["名称"] = df1_unique["领款人"]
df2_unique["日期"] = df2_unique["交易时间"]
df2_unique["金额"] = df2_unique["支出金额"]
df2_unique["名称"] = df2_unique["对方户名"]

# Condition 1: same amount and same date.
# pd.merge pairs null keys with each other, so rows with a missing date or
# amount are excluded first; otherwise two incomplete records would be
# reported as a match.
condition1_keys = ["日期", "金额"]
condition1_matches = pd.merge(
    df1_unique.dropna(subset=condition1_keys),
    df2_unique.dropna(subset=condition1_keys),
    on=condition1_keys,
    how="inner",
    suffixes=("_df1", "_df2"),
)
df1.loc[condition1_matches["df1_index"], "任务"] = "任务1"
df2.loc[condition1_matches["df2_index"], "任务"] = "任务1"

# Condition 2: same amount and payee name equal to the counterparty name.
# Null keys are excluded for the same reason as above.
condition2_keys = ["金额", "名称"]
condition2_matches = pd.merge(
    df1_unique.dropna(subset=condition2_keys),
    df2_unique.dropna(subset=condition2_keys),
    on=condition2_keys,
    how="inner",
    suffixes=("_df1", "_df2"),
)
df1.loc[condition2_matches["df1_index"], "任务"] = "任务1"
df2.loc[condition2_matches["df2_index"], "任务"] = "任务1"

# Combine both conditions; a pair satisfying both is reported only once.
task1_matches = pd.concat([condition1_matches, condition2_matches], ignore_index=True)
task1_matches = task1_matches.drop_duplicates(subset=["df1_index", "df2_index"])
task1_result = task1_matches[
    [
        "支款案号",
        "来源案号",
        "申请人",
        "支付日期",
        "支付金额",
        "领款人",
        "对方户名",
        "交易时间",
        "支出金额",
    ]
]

# Task 3: records still carrying the initial label.
task3_df1 = df1[df1["任务"] == "任务3"]
task3_df2 = df2[df2["任务"] == "任务3"]

# Write every result set to its own sheet of the output workbook.
with pd.ExcelWriter(RESULT_PATH) as writer:
    task1_result.to_excel(writer, sheet_name="任务1匹配结果", index=False)
    df1_duplicates.to_excel(
        writer, sheet_name="任务2_一案一账户支重复记录", index=False
    )
    df2_duplicates.to_excel(writer, sheet_name="任务2_银行支出重复记录", index=False)
    task3_df1.to_excel(writer, sheet_name="任务3_一案一账户支未匹配记录", index=False)
    task3_df2.to_excel(writer, sheet_name="任务3_支出未匹配记录", index=False)

print("结果已保存到 '匹配结果.xlsx' 文件中。")