From 9d82c9294f9ec2b383c88d8d38962d605ba94368 Mon Sep 17 00:00:00 2001
From: baol
Date: Wed, 25 Dec 2024 09:39:53 +0800
Subject: [PATCH] add feature

---
 case.py          | 70 ++++++++++++++++++++++++++++++++++++++++++++++++
 case1.py         | 47 ++++++++++++++++++++++++++++++++
 case2_in.py      | 45 +++++++++++++++++++++++++++++++
 case2_out.py     | 45 +++++++++++++++++++++++++++++++
 requirements.txt |  4 ++-
 5 files changed, 210 insertions(+), 1 deletion(-)
 create mode 100644 case.py
 create mode 100644 case1.py
 create mode 100644 case2_in.py
 create mode 100644 case2_out.py

diff --git a/case.py b/case.py
new file mode 100644
index 0000000..974e5bd
--- /dev/null
+++ b/case.py
@@ -0,0 +1,70 @@
+import pandas as pd
+
+# Source workbook holding both sheets that are reconciled below.
+file_path = "/home/baol/tools/数据草稿-案款.xls"  # replace with your Excel file path
+
+# Read the "账户收" sheet and keep only the listed columns.
+sheet_account_receipt = pd.read_excel(file_path, sheet_name="账户收")
+sheet_account_receipt = sheet_account_receipt[
+    ["归属本案金额", "到账金额", "到账日期", "缴款人", "案号"]
+]
+
+# Read the "收入" sheet (its header is on the second row) and keep only the listed columns.
+sheet_income = pd.read_excel(file_path, sheet_name="收入", header=1)
+sheet_income = sheet_income[["对方户名", "交易时间", "金额"]]
+
+# Normalise both date columns to "YYYY-MM-DD" strings so they compare equal in the merge.
+sheet_account_receipt["到账日期"] = pd.to_datetime(
+    sheet_account_receipt["到账日期"]
+).dt.strftime("%Y-%m-%d")
+sheet_income["交易时间"] = pd.to_datetime(sheet_income["交易时间"]).dt.strftime(
+    "%Y-%m-%d"
+)
+
+# Rename the account-sheet columns so both frames share the merge key names.
+sheet_account_receipt.rename(
+    columns={"到账金额": "金额", "到账日期": "交易时间"}, inplace=True
+)
+
+# Inner-join the two frames on (amount, date).
+merged_df = pd.merge(
+    sheet_account_receipt, sheet_income, on=["金额", "交易时间"], how="inner"
+)
+
+# Keep (amount, date) groups with exactly one merged row, i.e. unambiguous matches.
+filtered_df = merged_df.groupby(["金额", "交易时间"]).filter(lambda x: len(x) == 1)
+
+# Collect (amount, date) groups with more than one merged row, i.e. ambiguous matches.
+duplicate_records = merged_df.groupby(["金额", "交易时间"]).filter(lambda x: len(x) > 1)
+
+# Write both result sets to new Excel files.
+output_file_path_filtered = "/home/baol/tools/matched_records.xlsx"  # matched records
+output_file_path_duplicates = "/home/baol/tools/duplicate_records.xlsx"  # duplicate records
+
+filtered_df.to_excel(output_file_path_filtered, index=False, engine="openpyxl")
+duplicate_records.to_excel(output_file_path_duplicates, index=False, engine="openpyxl")
+
+print(f"匹配结果已保存到 {output_file_path_filtered}")
+print(f"重复记录已保存到 {output_file_path_duplicates}")
+
+
+# Merge the two frames again, this time recording which side each row came from.
+out_merged_df = pd.merge(
+    sheet_account_receipt,
+    sheet_income,
+    on=['金额', '交易时间'],
+    how='outer',  # outer join so unmatched rows from either side are kept
+    indicator=True  # adds the "_merge" column describing the match status
+)
+
+# Keep only the rows that were found on one side of the merge.
+unmatched_records = out_merged_df[out_merged_df['_merge'] != 'both']
+
+# Drop the indicator column by rebinding; in-place drop on a filtered slice warns in pandas.
+unmatched_records = unmatched_records.drop(columns=['_merge'])
+
+# Write the unmatched rows to a new Excel file.
+output_file_path_unmatched = '/home/baol/tools/unmatched_records.xlsx'  # unmatched records
+unmatched_records.to_excel(output_file_path_unmatched, index=False, engine='openpyxl')
+
+print(f"未匹配的所有记录已保存到 {output_file_path_unmatched}")
\ No newline at end of file
diff --git a/case1.py b/case1.py
new file mode 100644
index 0000000..64cc506
--- /dev/null
+++ b/case1.py
@@ -0,0 +1,47 @@
+import pandas as pd
+
+# Source workbook holding both sheets that are reconciled below.
+file_path = "/home/baol/tools/数据草稿-案款.xls"  # replace with your Excel file path
+sheet_account_receipt = pd.read_excel(file_path, sheet_name="账户收")
+sheet_income = pd.read_excel(file_path, sheet_name="收入", header=1)
+
+# Columns of interest: 归属本案金额, 到账金额, 到账日期, 缴款人, 案号
+# Columns of interest: 对方户名, 交易时间, 金额
+
+# Normalise both date columns to "YYYY-MM-DD" strings so they compare equal in the merge.
+sheet_account_receipt["到账日期"] = pd.to_datetime(
+    sheet_account_receipt["到账日期"]
+).dt.strftime("%Y-%m-%d")
+sheet_income["交易时间"] = pd.to_datetime(sheet_income["交易时间"]).dt.strftime(
+    "%Y-%m-%d"
+)
+
+# Rename the account-sheet columns so both frames share the merge key names.
+sheet_account_receipt.rename(
+    columns={"到账金额": "金额", "到账日期": "交易时间"}, inplace=True
+)
+
+# Inner-join the two frames on (amount, date).
+merged_df = pd.merge(
+    sheet_account_receipt, sheet_income, on=["金额", "交易时间"], how="inner"
+)
+
+# (Disabled) Exclude records where several entries share the same amount on the same day:
+# group by amount and date, then keep only the groups that contain a single row.
+# filtered_df = merged_df.groupby(["金额", "交易时间"]).filter(lambda x: len(x) == 1)
+
+# # Print the result
+# print(filtered_df)
+
+# output_file_path = "/home/baol/tools/case1_records.xlsx" # replace with the path to save to
+# filtered_df.to_excel(output_file_path, index=False)
+
+# print(f"匹配结果已保存到 {output_file_path}")
+
+
+# Collect (amount, date) groups with more than one merged row, i.e. ambiguous matches.
+duplicate_records = merged_df.groupby(['金额', '交易时间']).filter(lambda x: len(x) > 1)
+
+# Write the result to a new Excel file.
+output_file_path = '/home/baol/tools/case2_records.xlsx'  # replace with the path to save to
+duplicate_records.to_excel(output_file_path, index=False, engine='openpyxl')
\ No newline at end of file
diff --git a/case2_in.py b/case2_in.py
new file mode 100644
index 0000000..cf5ef45
--- /dev/null
+++ b/case2_in.py
@@ -0,0 +1,45 @@
+import pandas as pd
+
+# Source workbook holding both sheets that are reconciled below.
+file_path = "/home/baol/tools/数据草稿-案款1.xls"  # replace with your Excel file path
+
+# Read the "一案一账户收" sheet and keep only the listed columns.
+sheet_account_receipt = pd.read_excel(file_path, sheet_name="一案一账户收")
+sheet_account_receipt = sheet_account_receipt[
+    ["归属本案金额", "到账金额", "到账日期", "缴款人", "案号"]
+]
+
+# Read the "银行-收入" sheet (its header is on the second row) and keep only the listed columns.
+sheet_income = pd.read_excel(file_path, sheet_name="银行-收入", header=1)
+sheet_income = sheet_income[["对方户名", "交易时间", "金额"]]
+
+# Normalise both date columns to "YYYY-MM-DD" strings so they compare equal in the merge.
+sheet_account_receipt["到账日期"] = pd.to_datetime(
+    sheet_account_receipt["到账日期"]
+).dt.strftime("%Y-%m-%d")
+sheet_income["交易时间"] = pd.to_datetime(sheet_income["交易时间"]).dt.strftime(
+    "%Y-%m-%d"
+)
+
+# Merge the two frames on (amount, date) while keeping each side's own column names.
+merged_df = pd.merge(
+    sheet_account_receipt,
+    sheet_income,
+    left_on=["到账金额", "到账日期"],  # match columns on the account-receipt side
+    right_on=["金额", "交易时间"],  # match columns on the bank-income side
+    how="outer",  # outer join so unmatched rows from either side are kept
+    suffixes=("_一案一账户收", "_银行收入"),  # suffixes for column names present on both sides
+)
+
+# Rows whose key amount is missing on either side are treated as unmatched.
+unmatched_records = merged_df[merged_df["到账金额"].isna() | merged_df["金额"].isna()]
+
+# Write both result sets to new Excel files.
+output_file_path_merged = "/home/baol/tools/merged_records.xlsx"  # merged records
+output_file_path_unmatched = "/home/baol/tools/unmatched_records.xlsx"  # unmatched records
+
+merged_df.to_excel(output_file_path_merged, index=False, engine="openpyxl")
+unmatched_records.to_excel(output_file_path_unmatched, index=False, engine="openpyxl")
+
+print(f"合并后的记录已保存到 {output_file_path_merged}")
+print(f"未匹配的记录已保存到 {output_file_path_unmatched}")
diff --git a/case2_out.py b/case2_out.py
new file mode 100644
index 0000000..aa20053
--- /dev/null
+++ b/case2_out.py
@@ -0,0 +1,45 @@
+import pandas as pd
+
+# Source workbook holding both sheets that are reconciled below.
+file_path = "/home/baol/tools/数据草稿-案款1.xls"  # replace with your Excel file path
+
+# Read the "一案一账户支" sheet and keep only the listed columns.
+sheet_account_receipt = pd.read_excel(file_path, sheet_name="一案一账户支")
+sheet_account_receipt = sheet_account_receipt[
+    ["支款案号", "来源案号", "申请人", "支付日期", "支付金额", "领款人"]
+]
+
+# Read the "银行支出" sheet (its header is on the second row) and keep only the listed columns.
+sheet_income = pd.read_excel(file_path, sheet_name="银行支出", header=1)
+sheet_income = sheet_income[["对方户名", "交易时间", "支出金额"]]
+
+# Normalise both date columns to "YYYY-MM-DD" strings so they compare equal in the merge.
+sheet_account_receipt["支付日期"] = pd.to_datetime(
+    sheet_account_receipt["支付日期"]
+).dt.strftime("%Y-%m-%d")
+sheet_income["交易时间"] = pd.to_datetime(sheet_income["交易时间"]).dt.strftime(
+    "%Y-%m-%d"
+)
+
+# Merge the two frames on (amount, date) while keeping each side's own column names.
+merged_df = pd.merge(
+    sheet_account_receipt,
+    sheet_income,
+    left_on=["支付金额", "支付日期"],  # match columns on the account-payment side
+    right_on=["支出金额", "交易时间"],  # match columns on the bank-expenditure side
+    how="outer",  # outer join so unmatched rows from either side are kept
+    suffixes=("_一案一账户支", "_银行支出"),  # suffixes for column names present on both sides
+)
+
+# Rows whose key amount is missing on either side are treated as unmatched.
+unmatched_records = merged_df[merged_df["支付金额"].isna() | merged_df["支出金额"].isna()]
+
+# Write both result sets to new Excel files.
+output_file_path_merged = "/home/baol/tools/merged_out_records.xlsx"  # merged records
+output_file_path_unmatched = "/home/baol/tools/unmatched_out_records.xlsx"  # unmatched records
+
+merged_df.to_excel(output_file_path_merged, index=False, engine="openpyxl")
+unmatched_records.to_excel(output_file_path_unmatched, index=False, engine="openpyxl")
+
+print(f"合并后的记录已保存到 {output_file_path_merged}")
+print(f"未匹配的记录已保存到 {output_file_path_unmatched}")
diff --git a/requirements.txt b/requirements.txt
index d811548..239c94d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,4 @@
 pandas
-psycopg[binary]
\ No newline at end of file
+psycopg[binary]
+xlrd
+openpyxl
\ No newline at end of file