add feature

2024-12-25 16:38:07 +08:00 · 2024-12-25 16:38:07 +08:00 · aafe5e0195
commit aafe5e0195
parent a931d9ceb4
4 changed files with 208 additions and 5 deletions
--- a/copy.py
+++ b/copy.py
--- a/case1.py
+++ b/case1.py
@ -5,8 +5,8 @@ file_path = "/home/baol/tools/数据草稿-案款.xls"  # 替换为你的Excel
 sheet_account_receipt = pd.read_excel(file_path, sheet_name="账户收")
 sheet_income = pd.read_excel(file_path, sheet_name="收入", header=1)
-#归属本案金额、到账金额、到账日期、缴款人、案号
+# 归属本案金额、到账金额、到账日期、缴款人、案号
-#对方户名、交易时间、金额
+# 对方户名、交易时间、金额
 # 统一日期格式
 sheet_account_receipt["到账日期"] = pd.to_datetime(
@ -40,8 +40,8 @@ merged_df = pd.merge(
 # 检索出在某一天同时有多笔金额相同的记录
-duplicate_records = merged_df.groupby(['金额', '交易时间']).filter(lambda x: len(x) > 1)
+duplicate_records = merged_df.groupby(["金额", "交易时间"]).filter(lambda x: len(x) > 1)
 # 将结果保存为新的Excel文件
-output_file_path = '/home/baol/tools/case2_records.xlsx'  # 替换为你希望保存的文件路径
+output_file_path = "/home/baol/tools/case2_records.xlsx"  # 替换为你希望保存的文件路径
-duplicate_records.to_excel(output_file_path, index=False, engine='openpyxl')
+duplicate_records.to_excel(output_file_path, index=False, engine="openpyxl")
--- a/case_int.py
+++ b/case_int.py
@ -0,0 +1,99 @@
 import pandas as pd
 # Reconcile incoming case payments against bank receipts.
 #
 # Reads the case-account receipts sheet and the bank income sheet from one
 # workbook and writes three groups of rows to a result workbook:
 #   Task 1 - rows whose amount and date pair up one-to-one across both sheets
 #   Task 2 - rows that share the same amount on the same day within a sheet
 #   Task 3 - every remaining row of each sheet
 SOURCE_WORKBOOK = "/home/baol/tools/数据草稿-案款1.xls"
 RESULT_WORKBOOK = "/home/baol/tools/任务结果.xlsx"
 # Key columns used to pair a case receipt with a bank receipt.
 CASE_KEYS = ["到账金额", "到账日期"]
 BANK_KEYS = ["金额", "交易时间"]
 def _row_keys(frame, columns):
     """Return one tuple per row of *frame*, built from *columns*."""
     return frame[columns].apply(tuple, axis=1)
 # 1. Load both sheets; the bank sheet is read with header=1.
 df_case = pd.read_excel(SOURCE_WORKBOOK, sheet_name="一案一账户收")
 df_bank = pd.read_excel(SOURCE_WORKBOOK, sheet_name="银行-收入", header=1)
 # 2. Keep only the columns needed for matching and reporting.
 #    .copy() makes each subset an independent frame, so the date assignments
 #    below do not trigger pandas' SettingWithCopyWarning.
 df_case = df_case[["归属本案金额", "到账金额", "到账日期", "缴款人", "案号"]].copy()
 df_bank = df_bank[["对方户名", "交易时间", "金额"]].copy()
 # 3. Normalise both date columns to yyyy-mm-dd strings so they compare equal.
 df_case["到账日期"] = pd.to_datetime(df_case["到账日期"]).dt.strftime("%Y-%m-%d")
 df_bank["交易时间"] = pd.to_datetime(df_bank["交易时间"]).dt.strftime("%Y-%m-%d")
 # 4. Task 1: receipts whose amount and date match a bank row, excluding any
 #    (date, amount) pair that occurs more than once on either side.
 df_merged = pd.merge(
     df_case,
     df_bank,
     left_on=CASE_KEYS,
     right_on=BANK_KEYS,
     how="inner",
 )
 # Count how often each (date, amount) pair occurs in each sheet.
 case_counts = (
     df_case.groupby(["到账日期", "到账金额"]).size().reset_index(name="case_count")
 )
 bank_counts = (
     df_bank.groupby(["交易时间", "金额"]).size().reset_index(name="bank_count")
 )
 # Attach both counts to the matched rows.
 df_merged = pd.merge(df_merged, case_counts, on=["到账日期", "到账金额"])
 df_merged = pd.merge(df_merged, bank_counts, on=["交易时间", "金额"])
 # Keep only the pairs that are unique on both sides.
 task1_result = df_merged[
     (df_merged["case_count"] == 1) & (df_merged["bank_count"] == 1)
 ]
 # Original row labels, in each sheet, of the rows that took part in task 1.
 task1_case_indices = df_case[
     _row_keys(df_case, CASE_KEYS).isin(_row_keys(task1_result, CASE_KEYS))
 ].index
 task1_bank_indices = df_bank[
     _row_keys(df_bank, BANK_KEYS).isin(_row_keys(task1_result, BANK_KEYS))
 ].index
 # 5. Task 2: rows that share the same amount on the same day within a sheet.
 task2_case = df_case.groupby(["到账日期", "到账金额"]).filter(lambda x: len(x) > 1)
 task2_case_indices = task2_case.index
 task2_bank = df_bank.groupby(["交易时间", "金额"]).filter(lambda x: len(x) > 1)
 task2_bank_indices = task2_bank.index
 # 6. Task 3: rows of each sheet that belong to neither task 1 nor task 2.
 task3_case_indices = df_case.index.difference(
     task1_case_indices.union(task2_case_indices)
 )
 task3_case_result = df_case.loc[task3_case_indices]
 task3_bank_indices = df_bank.index.difference(
     task1_bank_indices.union(task2_bank_indices)
 )
 task3_bank_result = df_bank.loc[task3_bank_indices]
 # 7. Write every result set to its own sheet of the result workbook.
 with pd.ExcelWriter(RESULT_WORKBOOK) as writer:
     task1_result.to_excel(writer, sheet_name="任务1", index=False)
     task2_case.to_excel(writer, sheet_name="任务2_一案一账户收", index=False)
     task2_bank.to_excel(writer, sheet_name="任务2_银行收入", index=False)
     task3_case_result.to_excel(writer, sheet_name="任务3_一案一账户收", index=False)
     task3_bank_result.to_excel(writer, sheet_name="任务3_银行收入", index=False)
 print("所有任务结果已保存到 '任务结果.xlsx' 文件中。")
--- a/case_out.py
+++ b/case_out.py
@ -0,0 +1,104 @@
 import pandas as pd
 # Reconcile outgoing case payments against bank disbursements.
 #
 # Every row of both sheets ends up with exactly one label in the "任务" column:
 #   任务2 - shares the same date and amount with another row of its own sheet
 #   任务1 - non-duplicate row that matches a row of the other sheet
 #   任务3 - everything else (the initial label)
 SOURCE_WORKBOOK = "/home/baol/tools/数据草稿-案款1.xls"
 RESULT_WORKBOOK = "/home/baol/tools/匹配结果.xlsx"
 # Columns exported for each task-1 match.
 TASK1_COLUMNS = [
     "支款案号",
     "来源案号",
     "申请人",
     "支付日期",
     "支付金额",
     "领款人",
     "对方户名",
     "交易时间",
     "支出金额",
 ]
 # Load only the needed columns; the bank sheet is read with header=1.
 df1 = pd.read_excel(
     SOURCE_WORKBOOK,
     sheet_name="一案一账户支",
     usecols=["支款案号", "来源案号", "申请人", "支付日期", "支付金额", "领款人"],
 )
 df2 = pd.read_excel(
     SOURCE_WORKBOOK,
     sheet_name="银行支出",
     usecols=["对方户名", "交易时间", "支出金额"],
     header=1,
 )
 # Normalise both date columns to yyyy-mm-dd strings so they compare equal.
 df1["支付日期"] = pd.to_datetime(df1["支付日期"]).dt.strftime("%Y-%m-%d")
 df2["交易时间"] = pd.to_datetime(df2["交易时间"]).dt.strftime("%Y-%m-%d")
 # Force the amounts to numbers; values that cannot be parsed become NaN.
 df1["支付金额"] = pd.to_numeric(df1["支付金额"], errors="coerce")
 df2["支出金额"] = pd.to_numeric(df2["支出金额"], errors="coerce")
 # Every row starts as task 3 and is relabelled below when it qualifies.
 df1["任务"] = "任务3"
 df2["任务"] = "任务3"
 # Task 2: rows that share the same date and amount within one sheet.
 # The label is written BEFORE the snapshot is taken so that the exported
 # duplicate sheets carry "任务2" rather than the stale initial label.
 df1_is_duplicate = df1.duplicated(subset=["支付日期", "支付金额"], keep=False)
 df1.loc[df1_is_duplicate, "任务"] = "任务2"
 df1_duplicates = df1[df1_is_duplicate]
 df2_is_duplicate = df2.duplicated(subset=["交易时间", "支出金额"], keep=False)
 df2.loc[df2_is_duplicate, "任务"] = "任务2"
 df2_duplicates = df2[df2_is_duplicate]
 # Task-1 candidates: the rows still labelled task 3. reset_index() keeps the
 # original row label in a column so matches can be written back to df1/df2.
 df1_unique = (
     df1[df1["任务"] == "任务3"].reset_index().rename(columns={"index": "df1_index"})
 )
 df2_unique = (
     df2[df2["任务"] == "任务3"].reset_index().rename(columns={"index": "df2_index"})
 )
 # Helper columns with shared names so both frames can be merged with on=.
 df1_unique["日期"] = df1_unique["支付日期"]
 df1_unique["金额"] = df1_unique["支付金额"]
 df1_unique["名称"] = df1_unique["领款人"]
 df2_unique["日期"] = df2_unique["交易时间"]
 df2_unique["金额"] = df2_unique["支出金额"]
 df2_unique["名称"] = df2_unique["对方户名"]
 # Condition 1: same amount and same date.
 condition1_matches = pd.merge(
     df1_unique, df2_unique, on=["日期", "金额"], how="inner", suffixes=("_df1", "_df2")
 )
 df1.loc[condition1_matches["df1_index"], "任务"] = "任务1"
 df2.loc[condition1_matches["df2_index"], "任务"] = "任务1"
 # Condition 2: same amount and payee name equal to the counterparty name.
 # NOTE(review): this merge ignores the date, so one payment can pair with
 # several bank rows sharing amount and name - confirm that is intended.
 condition2_matches = pd.merge(
     df1_unique, df2_unique, on=["金额", "名称"], how="inner", suffixes=("_df1", "_df2")
 )
 df1.loc[condition2_matches["df1_index"], "任务"] = "任务1"
 df2.loc[condition2_matches["df2_index"], "任务"] = "任务1"
 # Combine both conditions, keeping each (payment, bank row) pair only once.
 task1_matches = pd.concat([condition1_matches, condition2_matches], ignore_index=True)
 task1_matches = task1_matches.drop_duplicates(subset=["df1_index", "df2_index"])
 task1_result = task1_matches[TASK1_COLUMNS]
 # Task 3: rows that are neither duplicates nor matched.
 task3_df1 = df1[df1["任务"] == "任务3"]
 task3_df2 = df2[df2["任务"] == "任务3"]
 # Write every result set to its own sheet of the result workbook.
 with pd.ExcelWriter(RESULT_WORKBOOK) as writer:
     task1_result.to_excel(writer, sheet_name="任务1匹配结果", index=False)
     df1_duplicates.to_excel(
         writer, sheet_name="任务2_一案一账户支重复记录", index=False
     )
     df2_duplicates.to_excel(writer, sheet_name="任务2_银行支出重复记录", index=False)
     task3_df1.to_excel(writer, sheet_name="任务3_一案一账户支未匹配记录", index=False)
     task3_df2.to_excel(writer, sheet_name="任务3_支出未匹配记录", index=False)
 print("结果已保存到 '匹配结果.xlsx' 文件中。")